def test_update_envs_env_update():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers, policy, env)
    rollouts = sampler.obtain_samples(0,
                                      161,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for rollout in rollouts.split():
        mean_rewards.append(rollout.rewards.mean())
        goals.append(rollout.env_infos['task'][0]['goal'])
    assert np.var(mean_rewards) > 0
    assert np.var(goals) > 0
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))
    sampler.shutdown_worker()
    env.close()
def test_cem_cartpole(self):
    """Test CEM with Cartpole-v1 environment."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(env_name='CartPole-v1')
        policy = CategoricalMLPPolicy(name='policy',
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        n_samples = 10
        algo = CEM(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   best_frac=0.1,
                   max_path_length=100,
                   n_samples=n_samples)
        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        rtn = runner.train(n_epochs=10,
                           batch_size=2048,
                           n_epoch_cycles=n_samples)
        assert rtn > 40
        env.close()
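# Illustrative sketch (not part of the original suite): the core loop of the
# cross-entropy method that the CEM test above exercises, shown on a toy
# quadratic objective instead of policy return. All names here are
# hypothetical; garage's CEM maximizes return rather than minimizing a loss,
# but the sample-then-refit-on-the-best-fraction structure is the same idea.
import numpy as np


def cem_minimize(objective, dim, n_samples=50, best_frac=0.2, n_iters=30):
    """Sample from a Gaussian, keep the best fraction, refit, repeat."""
    mean, std = np.zeros(dim), np.ones(dim)
    n_best = max(1, int(best_frac * n_samples))
    for _ in range(n_iters):
        samples = mean + std * np.random.randn(n_samples, dim)
        scores = np.array([objective(s) for s in samples])
        elite = samples[np.argsort(scores)[:n_best]]  # best fraction
        mean, std = elite.mean(axis=0), elite.std(axis=0) + 1e-6
    return mean


solution = cem_minimize(lambda x: np.sum((x - 3.0)**2), dim=2)
assert np.sum((solution - 3.0)**2) < 1.0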
def test_obtain_exact_trajectories():
    max_path_length = 15
    n_workers = 8
    env = TfEnv(PointEnv())
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_path_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers,
                                                         policies,
                                                         envs=env)
    n_traj_per_worker = 3
    rollouts = sampler.obtain_exact_trajectories(n_traj_per_worker,
                                                 agent_update=policies)
    # At least one action per trajectory.
    assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
    # All of the trajectories.
    assert len(rollouts.lengths) == n_workers * n_traj_per_worker
    worker = -1
    for count, rollout in enumerate(rollouts.split()):
        if count % n_traj_per_worker == 0:
            worker += 1
        assert (rollout.actions == per_worker_actions[worker]).all()
    sampler.shutdown_worker()
    env.close()
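# Illustrative sketch (not part of the original suite): the worker-indexing
# arithmetic in the loop above. Trajectories come back grouped by worker,
# n_traj_per_worker at a time, so incrementing `worker` every
# n_traj_per_worker trajectories is equivalent to integer division.
n_traj_per_worker = 3
owners = [count // n_traj_per_worker for count in range(9)]
assert owners == [0, 0, 0, 1, 1, 1, 2, 2, 2]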
def test_init_with_crashed_worker():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: TfEnv(PointEnv()))
    n_workers = 2
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)

    class CrashingPolicy:
        def reset(self, **kwargs):
            raise Exception('Intentional subprocess crash')

    bad_policy = CrashingPolicy()
    # This causes worker 2 to crash.
    sampler = MultiprocessingSampler.from_worker_factory(
        workers, [policy, bad_policy], envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, None)
    assert sum(rollouts.lengths) >= 160
    sampler.shutdown_worker()
    env.close()
def test_ppo_pendulum_recurrent(self):
    """Test PPO with InvertedDoublePendulum environment and recurrent policy."""
    logger.reset()
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
    policy = GaussianLSTMPolicy(env_spec=env.spec)
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        lr_clip_range=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 40
    env.close()
def test_ddpg_pendulum(self):
    """Test DDPG with InvertedDoublePendulum environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
        action_noise = OUStrategy(env.spec, sigma=0.2)
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            min_buffer_size=int(1e4),
            exploration_strategy=action_noise,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10,
                                    n_epoch_cycles=20,
                                    batch_size=100)
        assert last_avg_ret > 60
        env.close()
def test_npo_pendulum(self):
    """Test NPO with InvertedDoublePendulum environment."""
    with LocalRunner(self.sess) as runner:
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = NPO(env=env,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.98,
                   policy_ent_coeff=0.0)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 20
        env.close()
def test_ppo_pendulum_with_model(self):
    """Test PPO with model, with InvertedDoublePendulum environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaselineWithModel(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 30
        env.close()
def test_ppo_pendulum_recurrent_continuous_baseline(self):
    """Test PPO with InvertedDoublePendulum environment and recurrent policy."""
    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 100
        env.close()
def test_trpo_cnn_cubecrash(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('CubeCrash-v0')))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64),
                                      conv_filter_sizes=(8, 4),
                                      conv_strides=(4, 2),
                                      conv_pad='VALID',
                                      hidden_sizes=(32, 32))
        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64),
                                           filter_dims=(8, 4),
                                           strides=(4, 2),
                                           padding='VALID',
                                           hidden_sizes=(32, 32),
                                           use_trust_region=True))
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.98,
                    max_kl_step=0.01,
                    policy_ent_coeff=0.0,
                    flatten_input=False)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > -0.9
        env.close()
def test_cma_es_cartpole(self):
    """Test CMAES with Cartpole-v1 environment."""
    with LocalRunner() as runner:
        env = TfEnv(env_name="CartPole-v1")
        policy = CategoricalMLPPolicy(name="policy",
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        runner.initialize_tf_vars()
        n_samples = 20
        algo = CMAES(env_spec=env.spec,
                     policy=policy,
                     baseline=baseline,
                     max_path_length=100,
                     n_samples=n_samples)
        runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
        runner.train(n_epochs=1, batch_size=1000, n_epoch_cycles=n_samples)
        # No assertion on return because CMAES is not stable.
        env.close()
def test_dm_control_tf_policy(self):
    task = ALL_TASKS[0]
    with LocalRunner(sess=self.sess) as runner:
        env = TfEnv(DmControlEnv.from_suite(*task))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=5,
            discount=0.99,
            max_kl_step=0.01,
        )
        runner.setup(algo, env)
        runner.train(n_epochs=1, batch_size=10)
        env.close()
def test_baseline(self):
    """Test the baseline initialization."""
    box_env = TfEnv(DummyBoxEnv())
    continuous_mlp_baseline = ContinuousMLPBaseline(env_spec=box_env)
    gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)
    self.sess.run(tf.compat.v1.global_variables_initializer())
    continuous_mlp_baseline.get_param_values()
    gaussian_mlp_baseline.get_param_values()
    box_env.close()
def test_dqn_cartpole_pickle(self):
    """Test that DQN with CartPole environment survives a pickle round trip."""
    with LocalRunner(self.sess) as runner:
        n_epochs = 10
        n_epoch_cycles = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * n_epoch_cycles * sampler_batch_size
        env = TfEnv(gym.make('CartPole-v0'))
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e4),
                                           time_horizon=1)
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        epsilon_greedy_strategy = EpsilonGreedyStrategy(
            env_spec=env.spec,
            total_timesteps=num_timesteps,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_strategy=epsilon_greedy_strategy,
                   replay_buffer=replay_buffer,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=False,
                   n_train_steps=500,
                   grad_norm_clipping=5.0,
                   n_epoch_cycles=n_epoch_cycles,
                   target_network_update_freq=1,
                   buffer_batch_size=32)
        runner.setup(algo, env)
        with tf.variable_scope('DiscreteMLPQFunction/MLPModel/mlp/hidden_0',
                               reuse=True):
            bias = tf.get_variable('bias')
            # Assign all ones to the bias so the round trip is detectable.
            old_bias = tf.ones_like(bias).eval()
            bias.load(old_bias)
        h = pickle.dumps(algo)
        with tf.Session(graph=tf.Graph()):
            pickle.loads(h)
            with tf.variable_scope(
                    'DiscreteMLPQFunction/MLPModel/mlp/hidden_0', reuse=True):
                new_bias = tf.get_variable('bias')
                new_bias = new_bias.eval()
                assert np.array_equal(old_bias, new_bias)
        env.close()
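# Illustrative sketch (not part of the original suite): the pattern the
# pickle test above relies on, reduced to plain Python/numpy. TinyModel is a
# hypothetical stand-in for an algorithm object holding parameter arrays.
import pickle

import numpy as np


class TinyModel:
    """Stand-in object with a single parameter array."""

    def __init__(self):
        self.bias = np.zeros(64)


model = TinyModel()
model.bias = np.ones_like(model.bias)  # Perturb so the check is meaningful.
restored = pickle.loads(pickle.dumps(model))
assert np.array_equal(model.bias, restored.bias)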
class TestContinuousPolicies(TfGraphTestCase):
    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(DummyBoxEnv())
        self.obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, self.env.observation_space.flat_dim],
            name='obs')

    def teardown_method(self):
        self.env.close()
        super().teardown_method()

    def test_continuous_mlp_policy(self):
        continuous_mlp_policy = ContinuousMLPPolicy(env_spec=self.env,
                                                    hidden_sizes=(1, ))
        self.sess.run(tf.compat.v1.global_variables_initializer())
        obs = self.env.observation_space.high
        assert continuous_mlp_policy.get_action(obs)

    def test_gaussian_gru_policy(self):
        gaussian_gru_policy = GaussianGRUPolicy(env_spec=self.env,
                                                hidden_dim=1,
                                                state_include_action=False)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        gaussian_gru_policy.build(self.obs_var)
        gaussian_gru_policy.reset()
        obs = self.env.observation_space.high
        assert gaussian_gru_policy.get_action(obs)

    def test_gaussian_lstm_policy(self):
        gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=self.env,
                                                  hidden_dim=1,
                                                  state_include_action=False)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        gaussian_lstm_policy.build(self.obs_var)
        gaussian_lstm_policy.reset()
        obs = self.env.observation_space.high
        assert gaussian_lstm_policy.get_action(obs)

    def test_gaussian_mlp_policy(self):
        gaussian_mlp_policy = GaussianMLPPolicy(env_spec=self.env,
                                                hidden_sizes=(1, ))
        self.sess.run(tf.compat.v1.global_variables_initializer())
        gaussian_mlp_policy.build(self.obs_var)
        obs = self.env.observation_space.high
        assert gaussian_mlp_policy.get_action(obs)
class TestContinuousPolicies(TfGraphTestCase):
    def setUp(self):
        super().setUp()
        self.env = TfEnv(DummyBoxEnv())

    def tearDown(self):
        self.env.close()
        super().tearDown()

    def test_continuous_mlp_policy(self):
        continuous_mlp_policy = ContinuousMLPPolicy(env_spec=self.env,
                                                    hidden_sizes=(1, ))
        self.sess.run(tf.global_variables_initializer())
        obs = self.env.observation_space.high
        assert continuous_mlp_policy.get_action(obs)

    def test_deterministic_mlp_policy(self):
        deterministic_mlp_policy = DeterministicMLPPolicy(env_spec=self.env,
                                                          hidden_sizes=(1, ))
        self.sess.run(tf.global_variables_initializer())
        obs = self.env.observation_space.high
        assert deterministic_mlp_policy.get_action(obs)

    def test_gaussian_gru_policy(self):
        gaussian_gru_policy = GaussianGRUPolicy(env_spec=self.env,
                                                hidden_dim=1)
        self.sess.run(tf.global_variables_initializer())
        gaussian_gru_policy.reset()
        obs = self.env.observation_space.high
        assert gaussian_gru_policy.get_action(obs)

    def test_gaussian_lstm_policy(self):
        gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=self.env,
                                                  hidden_dim=1)
        self.sess.run(tf.global_variables_initializer())
        gaussian_lstm_policy.reset()
        obs = self.env.observation_space.high
        assert gaussian_lstm_policy.get_action(obs)

    def test_gaussian_mlp_policy(self):
        gaussian_mlp_policy = GaussianMLPPolicy(env_spec=self.env,
                                                hidden_sizes=(1, ))
        self.sess.run(tf.global_variables_initializer())
        obs = self.env.observation_space.high
        assert gaussian_mlp_policy.get_action(obs)
class TestGrayscale(unittest.TestCase):
    def setUp(self):
        self.env = TfEnv(DummyDiscretePixelEnv(random=False))
        self.env_g = TfEnv(Grayscale(DummyDiscretePixelEnv(random=False)))

    def tearDown(self):
        self.env.close()
        self.env_g.close()

    def test_gray_scale_invalid_environment_type(self):
        with self.assertRaises(ValueError):
            self.env.observation_space = Discrete(64)
            Grayscale(self.env)

    def test_gray_scale_invalid_environment_shape(self):
        with self.assertRaises(ValueError):
            self.env.observation_space = Box(low=0,
                                             high=255,
                                             shape=(4, ),
                                             dtype=np.uint8)
            Grayscale(self.env)

    def test_grayscale_observation_space(self):
        assert self.env_g.observation_space.shape == (
            self.env.observation_space.shape[:-1])

    def test_grayscale_reset(self):
        """RGB to grayscale conversion using scikit-image.

        Weights used for conversion: Y = 0.2125 R + 0.7154 G + 0.0721 B

        Reference:
        http://scikit-image.org/docs/dev/api/skimage.color.html#skimage.color.rgb2grey
        """
        gray_scale_output = np.round(
            np.dot(self.env.reset()[:, :, :3],
                   [0.2125, 0.7154, 0.0721])).astype(np.uint8)
        np.testing.assert_array_almost_equal(gray_scale_output,
                                             self.env_g.reset())

    def test_grayscale_step(self):
        self.env.reset()
        self.env_g.reset()
        obs, _, _, _ = self.env.step(1)
        obs_g, _, _, _ = self.env_g.step(1)
        gray_scale_output = np.round(
            np.dot(obs[:, :, :3], [0.2125, 0.7154, 0.0721])).astype(np.uint8)
        np.testing.assert_array_almost_equal(gray_scale_output, obs_g)
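# Illustrative sketch (not part of the original suite): the grayscale checks
# above reduce to a weighted sum over the RGB channels. This standalone numpy
# example uses only a random uint8 image; the helper name is hypothetical.
import numpy as np


def rgb_to_grayscale(image):
    """Convert an HxWx3 uint8 image with the scikit-image luma weights."""
    weights = [0.2125, 0.7154, 0.0721]  # Y = 0.2125 R + 0.7154 G + 0.0721 B
    return np.round(np.dot(image[:, :, :3], weights)).astype(np.uint8)


demo_image = np.random.randint(0, 256, size=(4, 4, 3), dtype=np.uint8)
assert rgb_to_grayscale(demo_image).shape == (4, 4)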
class TestStackFrames(unittest.TestCase):
    def setUp(self):
        self.n_frames = 4
        self.env = TfEnv(DummyDiscrete2DEnv(random=False))
        self.env_s = TfEnv(
            StackFrames(DummyDiscrete2DEnv(random=False),
                        n_frames=self.n_frames))
        self.width, self.height = self.env.observation_space.shape

    def tearDown(self):
        self.env.close()
        self.env_s.close()

    def test_stack_frames_invalid_environment_type(self):
        with self.assertRaises(ValueError):
            self.env.observation_space = Discrete(64)
            StackFrames(self.env, n_frames=4)

    def test_stack_frames_invalid_environment_shape(self):
        with self.assertRaises(ValueError):
            self.env.observation_space = Box(low=0,
                                             high=255,
                                             shape=(4, ),
                                             dtype=np.uint8)
            StackFrames(self.env, n_frames=4)

    def test_stack_frames_output_observation_space(self):
        assert self.env_s.observation_space.shape == (self.width, self.height,
                                                      self.n_frames)

    def test_stack_frames_for_reset(self):
        frame_stack = self.env.reset()
        for i in range(self.n_frames - 1):
            frame_stack = np.dstack((frame_stack, self.env.reset()))
        np.testing.assert_array_equal(self.env_s.reset(), frame_stack)

    def test_stack_frames_for_step(self):
        self.env.reset()
        self.env_s.reset()
        frame_stack = np.empty((self.width, self.height, self.n_frames))
        for i in range(10):
            frame_stack = frame_stack[:, :, 1:]
            obs, _, _, _ = self.env.step(1)
            frame_stack = np.dstack((frame_stack, obs))
            obs_stack, _, _, _ = self.env_s.step(1)
        np.testing.assert_array_equal(obs_stack, frame_stack)
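# Illustrative sketch (not part of the original suite): the frame-stacking
# bookkeeping mirrored in test_stack_frames_for_step, shown with plain numpy.
# The helper name and shapes are hypothetical.
import numpy as np


def push_frame(stack, frame):
    """Drop the oldest frame (channel 0) and append the newest one."""
    return np.dstack((stack[:, :, 1:], frame))


stack = np.zeros((2, 2, 4))  # 4 stacked 2x2 frames
for step in range(10):
    stack = push_frame(stack, np.full((2, 2), step))
# After enough pushes, only the most recent frames remain.
assert (stack[:, :, -1] == 9).all() and (stack[:, :, 0] == 6).all()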
def test_dqn_cartpole_grad_clip(self):
    """Test DQN with CartPole environment and gradient clipping."""
    with LocalRunner(self.sess) as runner:
        n_epochs = 10
        n_epoch_cycles = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * n_epoch_cycles * sampler_batch_size
        env = TfEnv(gym.make('CartPole-v0'))
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e4),
                                           time_horizon=1)
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
        epsilon_greedy_strategy = EpsilonGreedyStrategy(
            env_spec=env.spec,
            total_timesteps=num_timesteps,
            max_epsilon=1.0,
            min_epsilon=0.02,
            decay_ratio=0.1)
        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_strategy=epsilon_greedy_strategy,
                   replay_buffer=replay_buffer,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=False,
                   n_train_steps=500,
                   grad_norm_clipping=5.0,
                   n_epoch_cycles=n_epoch_cycles,
                   target_network_update_freq=1,
                   buffer_batch_size=32)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=n_epochs,
                                    n_epoch_cycles=n_epoch_cycles,
                                    batch_size=sampler_batch_size)
        assert last_avg_ret > 20
        env.close()
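# Illustrative sketch (not part of the original suite): a linear epsilon
# decay consistent with the max_epsilon/min_epsilon/decay_ratio arguments
# used above. This is an assumption about the schedule's shape, not garage's
# exact EpsilonGreedyStrategy implementation; all names here are hypothetical.
def epsilon_at(t, total_timesteps, max_epsilon=1.0, min_epsilon=0.02,
               decay_ratio=0.1):
    """Decay linearly over the first decay_ratio fraction of training."""
    decay_steps = decay_ratio * total_timesteps
    fraction = min(t / decay_steps, 1.0)
    return max_epsilon + fraction * (min_epsilon - max_epsilon)


assert epsilon_at(0, 50000) == 1.0
assert abs(epsilon_at(5000, 50000) - 0.02) < 1e-12  # fully decayed at 10%
assert epsilon_at(50000, 50000) == 0.02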
class TestRaySamplerTF:
    """Tests for RaySamplerTF, using a scripted policy on a 4x4 GridWorldEnv.

    Map layout ('4x4'):
        ['SFFF',
         'FHFH',
         'FFFH',
         'HFFG']

    Actions: 0: left, 1: down, 2: right, 3: up, -1: no move.
    Tiles: 'S': starting point; 'F' or '.': free space; 'W' or 'x': wall;
    'H' or 'o': hole (terminates episode); 'G': goal.

    Scripted action sequence: [2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1]
    """

    def setup_method(self):
        ray.init(local_mode=True, ignore_reinit_error=True)
        self.env = TfEnv(GridWorldEnv(desc='4x4'))
        self.policy = ScriptedPolicy(
            scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
        self.algo = Mock(env_spec=self.env.spec,
                         policy=self.policy,
                         max_path_length=16)

    def teardown_method(self):
        self.env.close()

    def test_ray_batch_sampler(self):
        workers = WorkerFactory(seed=100,
                                max_path_length=self.algo.max_path_length)
        sampler1 = RaySamplerTF(workers,
                                self.policy,
                                self.env,
                                num_processors=1)
        sampler1.start_worker()
        sampler1.shutdown_worker()
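# Illustrative sketch (not part of the original suite): the essence of a
# scripted policy like the ScriptedPolicy used above, which replays a fixed
# action sequence regardless of observations. This class is a hypothetical
# stand-in, not garage's implementation.
class ReplayPolicy:
    def __init__(self, scripted_actions):
        self._actions = scripted_actions
        self._t = 0

    def reset(self):
        self._t = 0

    def get_action(self, observation):
        # Ignore the observation; emit the next scripted action.
        action = self._actions[self._t % len(self._actions)]
        self._t += 1
        return action, {}


policy = ReplayPolicy([2, 2, 1, 0])
assert [policy.get_action(None)[0] for _ in range(4)] == [2, 2, 1, 0]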
def test_init_with_env_updates():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: TfEnv(PointEnv()))
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(
        workers, policy, envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
    sampler.shutdown_worker()
    env.close()
class TestNormalizedGym(unittest.TestCase):
    def setUp(self):
        self.env = TfEnv(
            normalize(gym.make('Pendulum-v0'),
                      normalize_reward=True,
                      normalize_obs=True,
                      flatten_obs=True))

    def tearDown(self):
        self.env.close()

    def test_does_not_modify_action(self):
        a = self.env.action_space.sample()
        a_copy = a.copy()
        self.env.reset()
        self.env.step(a)
        self.assertEqual(a, a_copy)

    def test_flatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                self.env.render()
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                self.assertEqual(next_obs.shape,
                                 self.env.observation_space.low.shape)
                if done:
                    break

    def test_unflatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                self.assertEqual(
                    self.env.observation_space.flatten(next_obs).size,
                    self.env.observation_space.flat_dim)
                if done:
                    break
class TestDiscretePolicies(TfGraphTestCase):
    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(DummyDiscreteEnv())
        self.obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, self.env.observation_space.flat_dim],
            name='obs')

    def teardown_method(self):
        self.env.close()
        super().teardown_method()

    def test_categorical_gru_policy(self):
        categorical_gru_policy = CategoricalGRUPolicy(
            env_spec=self.env, hidden_dim=1, state_include_action=False)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        categorical_gru_policy.build(self.obs_var)
        categorical_gru_policy.reset()
        obs = self.env.observation_space.high
        assert categorical_gru_policy.get_action(obs)

    def test_categorical_lstm_policy(self):
        categorical_lstm_policy = CategoricalLSTMPolicy(
            env_spec=self.env, hidden_dim=1, state_include_action=False)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        categorical_lstm_policy.build(self.obs_var)
        categorical_lstm_policy.reset()
        obs = self.env.observation_space.high
        assert categorical_lstm_policy.get_action(obs)

    def test_categorical_mlp_policy(self):
        categorical_mlp_policy = CategoricalMLPPolicy(env_spec=self.env,
                                                      hidden_sizes=(1, ))
        self.sess.run(tf.compat.v1.global_variables_initializer())
        categorical_mlp_policy.build(self.obs_var)
        obs = self.env.observation_space.high
        assert categorical_mlp_policy.get_action(obs)
def test_ddpg_pendulum_with_decayed_weights(self):
    """Test DDPG with InvertedPendulum environment and decayed weights.

    This environment has a [-3, 3] action_space bound.
    """
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))
        policy = ContinuousMLPPolicy(env_spec=env.spec,
                                     hidden_sizes=[64, 64],
                                     hidden_nonlinearity=tf.nn.relu,
                                     output_nonlinearity=tf.nn.tanh)
        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=[64, 64],
                                    hidden_nonlinearity=tf.nn.relu)
        replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                           size_in_transitions=int(1e6),
                                           time_horizon=100)
        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-4,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=20,
            target_update_tau=1e-2,
            n_train_steps=50,
            discount=0.9,
            policy_weight_decay=0.01,
            qf_weight_decay=0.01,
            min_buffer_size=int(5e3),
            exploration_policy=exploration_policy,
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=100)
        assert last_avg_ret > 10
        env.close()
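# Illustrative sketch (not part of the original suite): a minimal
# Ornstein-Uhlenbeck noise process of the kind AddOrnsteinUhlenbeckNoise
# layers over the policy's actions. The helper and its parameter names are
# hypothetical; only sigma mirrors the argument used above.
import numpy as np


def ou_noise(n_steps, dim, theta=0.15, sigma=0.2, dt=1.0, seed=0):
    """Generate temporally correlated noise: dx = -theta*x*dt + sigma*dW."""
    rng = np.random.RandomState(seed)
    x = np.zeros(dim)
    samples = []
    for _ in range(n_steps):
        x = x - theta * x * dt + sigma * np.sqrt(dt) * rng.randn(dim)
        samples.append(x)
    return np.stack(samples)


noise = ou_noise(100, dim=1)
assert noise.shape == (100, 1)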
class TestNormalizedGym:
    def setup_method(self):
        self.env = TfEnv(
            normalize(gym.make('Pendulum-v0'),
                      normalize_reward=True,
                      normalize_obs=True,
                      flatten_obs=True))

    def teardown_method(self):
        self.env.close()

    def test_does_not_modify_action(self):
        a = self.env.action_space.sample()
        a_copy = a.copy()
        self.env.reset()
        self.env.step(a)
        assert a == a_copy

    def test_flatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                self.env.render()
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                assert next_obs.shape == self.env.observation_space.low.shape
                if done:
                    break

    def test_unflatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                # yapf: disable
                assert (self.env.observation_space.flatten(next_obs).size ==
                        self.env.observation_space.flat_dim)
                # yapf: enable
                if done:
                    break
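# Illustrative sketch (not part of the original suite): one common way a
# normalize wrapper can whiten observations with running statistics. This is
# an assumption about the mechanism, not garage's exact implementation; the
# class and parameter names are hypothetical.
import numpy as np


class RunningObsNormalizer:
    def __init__(self, obs_dim, alpha=0.001, eps=1e-8):
        self._mean = np.zeros(obs_dim)
        self._var = np.ones(obs_dim)
        self._alpha = alpha
        self._eps = eps

    def update(self, obs):
        # Exponential moving estimates of mean and variance.
        self._mean = (1 - self._alpha) * self._mean + self._alpha * obs
        self._var = ((1 - self._alpha) * self._var +
                     self._alpha * np.square(obs - self._mean))

    def normalize(self, obs):
        return (obs - self._mean) / (np.sqrt(self._var) + self._eps)


normalizer = RunningObsNormalizer(obs_dim=3)
for _ in range(100):
    normalizer.update(np.random.randn(3) * 5 + 2)
assert normalizer.normalize(np.full(3, 2.0)).shape == (3, )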
def test_ppo_pendulum_recurrent_continuous_baseline(self):
    """Test PPO with InvertedDoublePendulum environment and recurrent policy."""
    with LocalRunner() as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        baseline = ContinuousMLPBaselineWithModel(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   lr_clip_range=0.01,
                   optimizer_args=dict(batch_size=32, max_epochs=10))
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 30
        env.close()
def test_baseline(self):
    """Test the baseline initialization."""
    box_env = TfEnv(DummyBoxEnv())
    deterministic_mlp_baseline = DeterministicMLPBaseline(env_spec=box_env)
    gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)
    discrete_env = TfEnv(Resize(DummyDiscrete2DEnv(), width=64, height=64))
    gaussian_conv_baseline = GaussianConvBaseline(
        env_spec=discrete_env,
        regressor_args=dict(conv_filters=[32, 32],
                            conv_filter_sizes=[1, 1],
                            conv_strides=[1, 1],
                            conv_pads=["VALID", "VALID"],
                            hidden_sizes=(32, 32)))
    self.sess.run(tf.global_variables_initializer())
    deterministic_mlp_baseline.get_param_values(trainable=True)
    gaussian_mlp_baseline.get_param_values(trainable=True)
    gaussian_conv_baseline.get_param_values(trainable=True)
    box_env.close()
    discrete_env.close()
class TestRepeatAction(unittest.TestCase):
    def setUp(self):
        self.env = TfEnv(DummyDiscreteEnv(random=False))
        self.env_r = TfEnv(
            RepeatAction(DummyDiscreteEnv(random=False), n_frame_to_repeat=4))

    def tearDown(self):
        self.env.close()
        self.env_r.close()

    def test_repeat_action_reset(self):
        np.testing.assert_array_equal(self.env.reset(), self.env_r.reset())

    def test_repeat_action_step(self):
        self.env.reset()
        self.env_r.reset()
        obs_repeat, _, _, _ = self.env_r.step(1)
        for i in range(4):
            obs, _, _, _ = self.env.step(1)
        np.testing.assert_array_equal(obs, obs_repeat)
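# Illustrative sketch (not part of the original suite): the core of an
# action-repeat wrapper like RepeatAction, written against the plain gym
# step() interface. The class and its reward handling are assumptions made
# for illustration, not garage's implementation.
class ActionRepeat:
    def __init__(self, env, n_frame_to_repeat=4):
        self._env = env
        self._n = n_frame_to_repeat

    def reset(self):
        return self._env.reset()

    def step(self, action):
        # Apply the same action n times; return the last observation and
        # the accumulated reward.
        total_reward = 0.0
        for _ in range(self._n):
            obs, reward, done, info = self._env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info


class _CountingEnv:
    """Tiny stand-in env whose observation counts the steps taken."""

    def __init__(self):
        self._t = 0

    def reset(self):
        self._t = 0
        return self._t

    def step(self, action):
        self._t += 1
        return self._t, 1.0, False, {}


wrapped = ActionRepeat(_CountingEnv(), n_frame_to_repeat=4)
wrapped.reset()
obs, reward, _, _ = wrapped.step(0)
assert obs == 4 and reward == 4.0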
def test_erwr_cartpole(self):
    """Test ERWR with Cartpole-v1 environment."""
    env = TfEnv(env_name="CartPole-v1")
    policy = CategoricalMLPPolicy(name="policy",
                                  env_spec=env.spec,
                                  hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = ERWR(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=10000,
                max_path_length=100,
                n_itr=10,
                discount=0.99)
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 80
    env.close()
def test_reps_cartpole(self):
    """Test REPS with gym Cartpole environment."""
    with LocalRunner(sess=self.sess) as runner:
        env = TfEnv(gym.make('CartPole-v0'))
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=[32, 32])
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = REPS(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99)
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
        assert last_avg_ret > 5
        env.close()