def test_get_action(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'gaussian_mlp_policy_with_model.GaussianMLPModel'),
                    new=SimpleGaussianMLPModel):
        policy = GaussianMLPPolicyWithModel(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)

    action, prob = policy.get_action(obs)

    expected_action = np.full(action_dim, 0.75)
    expected_mean = np.full(action_dim, 0.5)
    expected_log_std = np.full(action_dim, 0.5)

    assert env.action_space.contains(action)
    assert np.array_equal(action, expected_action)
    assert np.array_equal(prob['mean'], expected_mean)
    assert np.array_equal(prob['log_std'], expected_log_std)

    actions, probs = policy.get_actions([obs, obs, obs])
    for action, mean, log_std in zip(actions, probs['mean'],
                                     probs['log_std']):
        assert env.action_space.contains(action)
        assert np.array_equal(action, expected_action)
        assert np.array_equal(mean, expected_mean)
        assert np.array_equal(log_std, expected_log_std)
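# The tests in this file patch GaussianMLPModel with SimpleGaussianMLPModel and
# receive obs_dim/action_dim from a pytest.mark.parametrize decorator on the
# enclosing test class; example values below are illustrative, not the suite's
# own:
#
#   @pytest.mark.parametrize('obs_dim, action_dim', [((1, ), (1, )),
#                                                    ((2, ), (2, ))])
#
# A sketch of the assumed test double, reconstructed from the assertions above
# and from the 'return_var' lookup in test_is_pickleable below. The real
# fixture lives in the test suite; apart from the constant outputs
# (mean = log_std = 0.5, sample = 0.75) and the 'return_var' name, the import
# paths, base class, and output ordering here are assumptions.
from garage.tf.distributions import DiagonalGaussian
from garage.tf.models import Model


class SimpleGaussianMLPModel(Model):
    """Stand-in for GaussianMLPModel that emits constant outputs."""

    def __init__(self, output_dim, *args, name=None, **kwargs):
        super().__init__(name)
        self.output_dim = output_dim

    def network_output_spec(self):
        return ['sample', 'mean', 'log_std', 'std_param', 'dist']

    def _build(self, obs_input, name=None):
        # 'return_var' is the variable test_is_pickleable overwrites with ones.
        return_var = tf.get_variable(
            'return_var', (), initializer=tf.constant_initializer(0.5))
        mean = tf.fill((tf.shape(obs_input)[0], self.output_dim), return_var)
        log_std = tf.fill((tf.shape(obs_input)[0], self.output_dim), 0.5)
        sample = mean + 0.5 * log_std  # 0.75 with the default return_var
        dist = DiagonalGaussian(self.output_dim)
        return sample, mean, log_std, log_std, dist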
def test_ppo_pendulum_with_model(self):
    """Test PPO with model, with Pendulum environment."""
    with LocalRunner(self.sess) as runner:
        env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
        policy = GaussianMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaselineWithModel(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
        )
        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 30

        env.close()
def test_is_pickleable(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'gaussian_mlp_policy_with_model.GaussianMLPModel'),
                    new=SimpleGaussianMLPModel):
        policy = GaussianMLPPolicyWithModel(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)
    obs_dim = env.spec.observation_space.flat_dim

    with tf.variable_scope('GaussianMLPPolicyWithModel/GaussianMLPModel',
                           reuse=True):
        return_var = tf.get_variable('return_var')
    # assign it to all ones
    return_var.load(tf.ones_like(return_var).eval())

    output1 = self.sess.run(
        policy.model.outputs[:-1],
        feed_dict={policy.model.input: [obs.flatten()]})

    p = pickle.dumps(policy)
    with tf.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        output2 = sess.run(
            policy_pickled.model.outputs[:-1],
            feed_dict={policy_pickled.model.input: [obs.flatten()]})

    assert np.array_equal(output1, output2)
def test_ppo_pendulum_with_model(self):
    """Test PPO with model, with Pendulum environment."""
    logger.reset()
    env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
    policy = GaussianMLPPolicyWithModel(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=10,
        discount=0.99,
        lr_clip_range=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False,
    )
    last_avg_ret = algo.train(sess=self.sess)
    assert last_avg_ret > 40

    env.close()
def test_is_pickleable(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'gaussian_mlp_policy_with_model.GaussianMLPModel'),
                    new=SimpleGaussianMLPModel):
        policy = GaussianMLPPolicyWithModel(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)
    obs_dim = env.spec.observation_space.flat_dim

    action1, prob1 = policy.get_action(obs)

    p = pickle.dumps(policy)
    with tf.Session(graph=tf.Graph()):
        policy_pickled = pickle.loads(p)
        action2, prob2 = policy_pickled.get_action(obs)

    assert env.action_space.contains(action1)
    assert np.array_equal(action1, action2)
    assert np.array_equal(prob1['mean'], prob2['mean'])
    assert np.array_equal(prob1['log_std'], prob2['log_std'])
def test_dist_info_sym(self, obs_dim, action_dim):
    env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'gaussian_mlp_policy_with_model.GaussianMLPModel'),
                    new=SimpleGaussianMLPModel):
        policy = GaussianMLPPolicyWithModel(env_spec=env.spec)

    env.reset()
    obs, _, _, _ = env.step(1)

    obs_dim = env.spec.observation_space.flat_dim
    obs_ph = tf.placeholder(tf.float32, shape=(None, obs_dim))

    dist1_sym = policy.dist_info_sym(obs_ph, name='p1_sym')

    expected_mean = np.full(action_dim, 0.5)
    expected_log_std = np.full(action_dim, 0.5)

    prob = self.sess.run(dist1_sym, feed_dict={obs_ph: [obs.flatten()]})
    assert np.array_equal(prob['mean'], expected_mean)
    assert np.array_equal(prob['log_std'], expected_log_std)
def run_task(snapshot_config, *_):
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicyWithModel(env_spec=env.spec,
                                            hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env,
                     sampler_cls=RaySamplerTF,
                     # `seed` is expected at module scope; see the launcher
                     # sketch below.
                     sampler_args={'seed': seed})

        runner.train(n_epochs=40, batch_size=4000)
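# A minimal launcher sketch for run_task above, using the standard garage
# entry point from this era; the value of `seed` (referenced by run_task's
# sampler_args) is illustrative:
from garage.experiment import run_experiment

seed = 100  # assumed module-level seed consumed by run_task

run_experiment(
    run_task,
    snapshot_mode='last',
    seed=seed,
)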
class TestGaussianMLPPolicyWithModelTransit(TfGraphTestCase):
    def setup_method(self):
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            super().setup_method()
            self.box_env = TfEnv(DummyBoxEnv())
            self.policy1 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.0,
                                             name='P1')
            self.policy2 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.2,
                                             name='P2')
            self.policy3 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.0,
                                                      name='P3')
            self.policy4 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.2,
                                                      name='P4')

        self.sess.run(tf.global_variables_initializer())

        for a, b in zip(self.policy3.get_params(), self.policy1.get_params()):
            self.sess.run(tf.assign(b, a))
        for a, b in zip(self.policy4.get_params(), self.policy2.get_params()):
            self.sess.run(tf.assign(b, a))

        self.obs = [self.box_env.reset()]
        self.obs_ph = tf.placeholder(
            tf.float32, shape=(None, self.box_env.observation_space.flat_dim))
        self.action_ph = tf.placeholder(
            tf.float32, shape=(None, self.box_env.action_space.flat_dim))

        self.dist1_sym = self.policy1.dist_info_sym(self.obs_ph, name='p1_sym')
        self.dist2_sym = self.policy2.dist_info_sym(self.obs_ph, name='p2_sym')
        self.dist3_sym = self.policy3.dist_info_sym(self.obs_ph, name='p3_sym')
        self.dist4_sym = self.policy4.dist_info_sym(self.obs_ph, name='p4_sym')

        assert self.policy1.vectorized == self.policy2.vectorized
        assert self.policy3.vectorized == self.policy4.vectorized

    def test_dist_info_sym_output(self):
        dist1 = self.sess.run(self.dist1_sym,
                              feed_dict={self.obs_ph: self.obs})
        dist2 = self.sess.run(self.dist2_sym,
                              feed_dict={self.obs_ph: self.obs})
        dist3 = self.sess.run(self.dist3_sym,
                              feed_dict={self.obs_ph: self.obs})
        dist4 = self.sess.run(self.dist4_sym,
                              feed_dict={self.obs_ph: self.obs})

        assert np.array_equal(dist1['mean'], dist3['mean'])
        assert np.array_equal(dist1['log_std'], dist3['log_std'])
        assert np.array_equal(dist2['mean'], dist4['mean'])
        assert np.array_equal(dist2['log_std'], dist4['log_std'])

    @mock.patch('numpy.random.normal')
    def test_get_action(self, mock_rand):
        mock_rand.return_value = 0.5
        action1, _ = self.policy1.get_action(self.obs)
        action2, _ = self.policy2.get_action(self.obs)
        action3, _ = self.policy3.get_action(self.obs)
        action4, _ = self.policy4.get_action(self.obs)

        assert np.array_equal(action1, action3)
        assert np.array_equal(action2, action4)

        actions1, dist_info1 = self.policy1.get_actions([self.obs])
        actions2, dist_info2 = self.policy2.get_actions([self.obs])
        actions3, dist_info3 = self.policy3.get_actions([self.obs])
        actions4, dist_info4 = self.policy4.get_actions([self.obs])

        assert np.array_equal(actions1, actions3)
        assert np.array_equal(actions2, actions4)
        assert np.array_equal(dist_info1['mean'], dist_info3['mean'])
        assert np.array_equal(dist_info1['log_std'], dist_info3['log_std'])
        assert np.array_equal(dist_info2['mean'], dist_info4['mean'])
        assert np.array_equal(dist_info2['log_std'], dist_info4['log_std'])

    def test_kl_sym(self):
        kl_diff_sym1 = self.policy1.distribution.kl_sym(
            self.dist1_sym, self.dist2_sym)
        objective1 = tf.reduce_mean(kl_diff_sym1)
        kl_func = tensor_utils.compile_function([self.obs_ph], objective1)
        kl1 = kl_func(self.obs, self.obs)

        kl_diff_sym2 = self.policy3.distribution.kl_sym(
            self.dist3_sym, self.dist4_sym)
        objective2 = tf.reduce_mean(kl_diff_sym2)
        kl_func = tensor_utils.compile_function([self.obs_ph], objective2)
        kl2 = kl_func(self.obs, self.obs)

        assert np.array_equal(kl1, kl2)
        assert kl1 == pytest.approx(kl2)

    def test_log_likelihood_sym(self):
        log_prob_sym1 = self.policy1.distribution.log_likelihood_sym(
            self.action_ph, self.dist1_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func(self.obs, [[1, 1]])

        log_prob_sym2 = self.policy3.model.networks[
            'default'].dist.log_likelihood_sym(self.action_ph, self.dist3_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2(self.obs, [[1, 1]])
        assert log_prob1 == log_prob2

        log_prob_sym1 = self.policy2.distribution.log_likelihood_sym(
            self.action_ph, self.dist2_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func(self.obs, [[1, 1]])

        log_prob_sym2 = self.policy4.model.networks[
            'default'].dist.log_likelihood_sym(self.action_ph, self.dist4_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2(self.obs, [[1, 1]])
        assert log_prob1 == log_prob2

    def test_policy_entropy_sym(self):
        entropy_sym1 = self.policy1.distribution.entropy_sym(
            self.dist1_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym1)
        entropy1 = entropy_func(self.obs)

        entropy_sym2 = self.policy3.distribution.entropy_sym(
            self.dist3_sym, name='entropy_sym2')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym2)
        entropy2 = entropy_func(self.obs)
        assert entropy1 == entropy2

    def test_likelihood_ratio_sym(self):
        likelihood_ratio_sym1 = self.policy1.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist1_sym,
            self.dist2_sym,
            name='li_ratio_sym1')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym1)
        likelihood_ratio1 = likelihood_ratio_func([[1, 1]], self.obs)

        likelihood_ratio_sym2 = self.policy3.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist3_sym,
            self.dist4_sym,
            name='li_ratio_sym2')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym2)
        likelihood_ratio2 = likelihood_ratio_func([[1, 1]], self.obs)

        assert likelihood_ratio1 == likelihood_ratio2
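# For reference, tensor_utils.compile_function used throughout the transit
# tests above is, to a first approximation, the thin wrapper sketched below
# (a sketch for readers, not garage's actual implementation):
def compile_function(inputs, outputs):
    def run(*input_vals):
        # Run the requested outputs against the default session, pairing each
        # placeholder in `inputs` with the corresponding positional value.
        sess = tf.get_default_session()
        return sess.run(outputs, feed_dict=dict(zip(inputs, input_vals)))

    return run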