def test_get_action(self, obs_dim, action_dim):
        """get_action/get_actions return the mocked model's constant outputs.

        SimpleGaussianMLPModel is patched in so mean/log_std are fixed at
        0.5, which makes the sampled action 0.75 deterministic.
        """
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'gaussian_mlp_policy_with_model.GaussianMLPModel'),
                        new=SimpleGaussianMLPModel):
            policy = GaussianMLPPolicyWithModel(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)

        action, prob = policy.get_action(obs)

        expected_action = np.full(action_dim, 0.75)
        expected_mean = np.full(action_dim, 0.5)
        expected_log_std = np.full(action_dim, 0.5)

        assert env.action_space.contains(action)
        assert np.array_equal(action, expected_action)
        assert np.array_equal(prob['mean'], expected_mean)
        assert np.array_equal(prob['log_std'], expected_log_std)

        actions, probs = policy.get_actions([obs, obs, obs])
        for action, mean, log_std in zip(actions, probs['mean'],
                                         probs['log_std']):
            assert env.action_space.contains(action)
            assert np.array_equal(action, expected_action)
            # Bug fix: assert on each item's mean/log_std unpacked from the
            # zip, not on the stale single-action `prob` dict from the
            # earlier get_action call (which left the batch values untested).
            assert np.array_equal(mean, expected_mean)
            assert np.array_equal(log_std, expected_log_std)
# Example #2 (score: 0)
    def test_ppo_pendulum_with_model(self):
        """Smoke-test PPO with model-based policy/baseline on a MuJoCo task.

        Trains for 10 epochs on InvertedDoublePendulum-v2 and checks the
        final average return clears a minimal performance bar.
        """
        with LocalRunner(self.sess) as runner:
            # Normalized MuJoCo environment wrapped for TensorFlow.
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))

            # Two tanh hidden layers; identity (linear) output layer.
            policy = GaussianMLPPolicyWithModel(
                env_spec=env.spec,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )

            baseline = GaussianMLPBaselineWithModel(
                env_spec=env.spec,
                regressor_args={'hidden_sizes': (32, 32)},
            )

            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                lr_clip_range=0.01,
                optimizer_args={'batch_size': 32, 'max_epochs': 10},
            )

            runner.setup(algo, env)
            avg_return = runner.train(n_epochs=10, batch_size=2048)
            # A working setup comfortably exceeds this return threshold.
            assert avg_return > 30

            env.close()
    def test_is_pickleable(self, obs_dim, action_dim):
        """Pickling round-trips the policy with identical model outputs."""
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'gaussian_mlp_policy_with_model.GaussianMLPModel'),
                        new=SimpleGaussianMLPModel):
            policy = GaussianMLPPolicyWithModel(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)
        obs_dim = env.spec.observation_space.flat_dim

        # Overwrite the mock model's variable with ones so the comparison
        # below is not trivially checking freshly-initialized values.
        with tf.variable_scope('GaussianMLPPolicyWithModel/GaussianMLPModel',
                               reuse=True):
            return_var = tf.get_variable('return_var')
        return_var.load(tf.ones_like(return_var).eval())

        before = self.sess.run(
            policy.model.outputs[:-1],
            feed_dict={policy.model.input: [obs.flatten()]})

        pickled = pickle.dumps(policy)
        with tf.Session(graph=tf.Graph()) as sess:
            restored = pickle.loads(pickled)
            after = sess.run(
                restored.model.outputs[:-1],
                feed_dict={restored.model.input: [obs.flatten()]})
            assert np.array_equal(before, after)
# Example #4 (score: 0)
    def test_ppo_pendulum_with_model(self):
        """Smoke-test PPO (legacy algo.train API) on InvertedDoublePendulum.

        Runs 10 iterations and checks the final average return clears a
        minimal performance bar.
        """
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

        # Two tanh hidden layers; identity (linear) output layer.
        policy = GaussianMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args={'hidden_sizes': (32, 32)},
        )

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args={'batch_size': 32, 'max_epochs': 10},
            plot=False,
        )

        avg_return = algo.train(sess=self.sess)
        # A working setup comfortably exceeds this return threshold.
        assert avg_return > 40

        env.close()
# Example #5 (score: 0)
    def setup_method(self):
        """Build paired old/new-style Gaussian MLP policies for comparison.

        Creates two GaussianMLPPolicy instances (P1, P2) and two
        GaussianMLPPolicyWithModel instances (P3, P4) with matching init_std
        values, then copies P3's parameters into P1 and P4's into P2 so the
        pairs are numerically identical. Also builds shared placeholders and
        symbolic dist-info ops used by the test methods.
        """
        # Patch tf.random.normal to a constant so any sampling done during
        # construction is deterministic across the paired policies.
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            super().setup_method()
            self.box_env = TfEnv(DummyBoxEnv())
            self.policy1 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.0,
                                             name='P1')
            self.policy2 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.2,
                                             name='P2')
            self.policy3 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.0,
                                                      name='P3')
            self.policy4 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.2,
                                                      name='P4')

            self.sess.run(tf.global_variables_initializer())

            # Sync parameters: copy policy3 -> policy1 and policy4 -> policy2
            # (a is the source variable, b the destination).
            for a, b in zip(self.policy3.get_params(),
                            self.policy1.get_params()):
                self.sess.run(tf.assign(b, a))
            for a, b in zip(self.policy4.get_params(),
                            self.policy2.get_params()):
                self.sess.run(tf.assign(b, a))

            # Single observation (batched) plus feed placeholders shared by
            # all symbolic tests.
            self.obs = [self.box_env.reset()]
            self.obs_ph = tf.placeholder(
                tf.float32,
                shape=(None, self.box_env.observation_space.flat_dim))
            self.action_ph = tf.placeholder(
                tf.float32, shape=(None, self.box_env.action_space.flat_dim))

            # Symbolic distribution-info ops for each policy.
            self.dist1_sym = self.policy1.dist_info_sym(self.obs_ph,
                                                        name='p1_sym')
            self.dist2_sym = self.policy2.dist_info_sym(self.obs_ph,
                                                        name='p2_sym')
            self.dist3_sym = self.policy3.dist_info_sym(self.obs_ph,
                                                        name='p3_sym')
            self.dist4_sym = self.policy4.dist_info_sym(self.obs_ph,
                                                        name='p4_sym')

            assert self.policy1.vectorized == self.policy2.vectorized
            assert self.policy3.vectorized == self.policy4.vectorized
# Example #6 (score: 0)
    def test_is_pickleable(self, obs_dim, action_dim):
        """Actions and dist params are unchanged by a pickle round trip."""
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'gaussian_mlp_policy_with_model.GaussianMLPModel'),
                        new=SimpleGaussianMLPModel):
            policy = GaussianMLPPolicyWithModel(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)
        obs_dim = env.spec.observation_space.flat_dim

        original_action, original_prob = policy.get_action(obs)

        data = pickle.dumps(policy)
        # Deserialize into a fresh graph/session to prove independence from
        # the original graph's state.
        with tf.Session(graph=tf.Graph()):
            restored = pickle.loads(data)
            restored_action, restored_prob = restored.get_action(obs)

        assert env.action_space.contains(original_action)
        assert np.array_equal(original_action, restored_action)
        assert np.array_equal(original_prob['mean'], restored_prob['mean'])
        assert np.array_equal(original_prob['log_std'],
                              restored_prob['log_std'])
    def test_dist_info_sym(self, obs_dim, action_dim):
        """dist_info_sym yields the mocked model's constant mean/log_std."""
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'gaussian_mlp_policy_with_model.GaussianMLPModel'),
                        new=SimpleGaussianMLPModel):
            policy = GaussianMLPPolicyWithModel(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)

        flat_obs_dim = env.spec.observation_space.flat_dim
        obs_input = tf.placeholder(tf.float32, shape=(None, flat_obs_dim))

        dist_sym = policy.dist_info_sym(obs_input, name='p1_sym')

        dist = self.sess.run(dist_sym,
                             feed_dict={obs_input: [obs.flatten()]})

        # SimpleGaussianMLPModel pins both mean and log_std at 0.5.
        assert np.array_equal(dist['mean'], np.full(action_dim, 0.5))
        assert np.array_equal(dist['log_std'], np.full(action_dim, 0.5))
# Example #8 (score: 0)
def run_task(snapshot_config, *_):
    """Train TRPO with a GaussianMLPPolicyWithModel on Swimmer-v2.

    Args:
        snapshot_config: Snapshot configuration forwarded to LocalTFRunner.
        *_: Unused extra positional arguments (runner-launcher convention).
    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('Swimmer-v2'))

        policy = GaussianMLPPolicyWithModel(env_spec=env.spec,
                                            hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        # NOTE(review): `seed` is not defined in this function or its
        # parameters — presumably a module-level global; verify it exists,
        # otherwise this line raises NameError at runtime.
        runner.setup(algo,
                     env,
                     sampler_cls=RaySamplerTF,
                     sampler_args={'seed': seed})
        runner.train(n_epochs=40, batch_size=4000)
# Example #9 (score: 0)
class TestGaussianMLPPolicyWithModelTransit(TfGraphTestCase):
    """Check old- and new-style Gaussian MLP policies behave identically.

    Builds two GaussianMLPPolicy instances (P1, P2) and two
    GaussianMLPPolicyWithModel instances (P3, P4) with matching init_std
    values, syncs their parameters, and asserts that actions, dist-info,
    KL, log-likelihood, entropy and likelihood-ratio computations agree
    between the paired implementations.
    """

    def setup_method(self):
        """Construct the paired policies and shared symbolic ops."""
        # Patch tf.random.normal to a constant so any sampling done during
        # construction is deterministic across the paired policies.
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            super().setup_method()
            self.box_env = TfEnv(DummyBoxEnv())
            self.policy1 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.0,
                                             name='P1')
            self.policy2 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.2,
                                             name='P2')
            self.policy3 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.0,
                                                      name='P3')
            self.policy4 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.2,
                                                      name='P4')

            self.sess.run(tf.global_variables_initializer())

            # Sync parameters: copy policy3 -> policy1 and policy4 -> policy2
            # (a is the source variable, b the destination).
            for a, b in zip(self.policy3.get_params(),
                            self.policy1.get_params()):
                self.sess.run(tf.assign(b, a))
            for a, b in zip(self.policy4.get_params(),
                            self.policy2.get_params()):
                self.sess.run(tf.assign(b, a))

            # Single observation (batched) plus feed placeholders shared by
            # all symbolic tests.
            self.obs = [self.box_env.reset()]
            self.obs_ph = tf.placeholder(
                tf.float32,
                shape=(None, self.box_env.observation_space.flat_dim))
            self.action_ph = tf.placeholder(
                tf.float32, shape=(None, self.box_env.action_space.flat_dim))

            # Symbolic distribution-info ops for each policy.
            self.dist1_sym = self.policy1.dist_info_sym(self.obs_ph,
                                                        name='p1_sym')
            self.dist2_sym = self.policy2.dist_info_sym(self.obs_ph,
                                                        name='p2_sym')
            self.dist3_sym = self.policy3.dist_info_sym(self.obs_ph,
                                                        name='p3_sym')
            self.dist4_sym = self.policy4.dist_info_sym(self.obs_ph,
                                                        name='p4_sym')

            assert self.policy1.vectorized == self.policy2.vectorized
            assert self.policy3.vectorized == self.policy4.vectorized

    def test_dist_info_sym_output(self):
        """Paired policies produce identical mean/log_std for the same obs."""
        dist1 = self.sess.run(self.dist1_sym,
                              feed_dict={self.obs_ph: self.obs})
        dist2 = self.sess.run(self.dist2_sym,
                              feed_dict={self.obs_ph: self.obs})
        dist3 = self.sess.run(self.dist3_sym,
                              feed_dict={self.obs_ph: self.obs})
        dist4 = self.sess.run(self.dist4_sym,
                              feed_dict={self.obs_ph: self.obs})

        assert np.array_equal(dist1['mean'], dist3['mean'])
        assert np.array_equal(dist1['log_std'], dist3['log_std'])
        assert np.array_equal(dist2['mean'], dist4['mean'])
        assert np.array_equal(dist2['log_std'], dist4['log_std'])

    @mock.patch('numpy.random.normal')
    def test_get_action(self, mock_rand):
        """Paired policies sample identical actions under a fixed RNG."""
        # Fix numpy sampling so both implementations draw the same noise.
        mock_rand.return_value = 0.5
        action1, _ = self.policy1.get_action(self.obs)
        action2, _ = self.policy2.get_action(self.obs)
        action3, _ = self.policy3.get_action(self.obs)
        action4, _ = self.policy4.get_action(self.obs)

        assert np.array_equal(action1, action3)
        assert np.array_equal(action2, action4)

        actions1, dist_info1 = self.policy1.get_actions([self.obs])
        actions2, dist_info2 = self.policy2.get_actions([self.obs])
        actions3, dist_info3 = self.policy3.get_actions([self.obs])
        actions4, dist_info4 = self.policy4.get_actions([self.obs])

        assert np.array_equal(actions1, actions3)
        assert np.array_equal(actions2, actions4)

        assert np.array_equal(dist_info1['mean'], dist_info3['mean'])
        assert np.array_equal(dist_info1['log_std'], dist_info3['log_std'])
        assert np.array_equal(dist_info2['mean'], dist_info4['mean'])
        assert np.array_equal(dist_info2['log_std'], dist_info4['log_std'])

    def test_kl_sym(self):
        """KL(P1||P2) matches KL(P3||P4) given synced parameters."""
        kl_diff_sym1 = self.policy1.distribution.kl_sym(
            self.dist1_sym, self.dist2_sym)
        objective1 = tf.reduce_mean(kl_diff_sym1)

        kl_func = tensor_utils.compile_function([self.obs_ph], objective1)
        kl1 = kl_func(self.obs, self.obs)

        kl_diff_sym2 = self.policy3.distribution.kl_sym(
            self.dist3_sym, self.dist4_sym)
        objective2 = tf.reduce_mean(kl_diff_sym2)

        kl_func = tensor_utils.compile_function([self.obs_ph], objective2)
        kl2 = kl_func(self.obs, self.obs)

        assert np.array_equal(kl1, kl2)
        assert kl1 == pytest.approx(kl2)

    def test_log_likehihood_sym(self):
        """log-likelihood of a fixed action agrees between paired policies."""
        log_prob_sym1 = self.policy1.distribution.log_likelihood_sym(
            self.action_ph, self.dist1_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func(self.obs, [[1, 1]])

        # The model-based policy exposes its distribution via the default
        # network rather than a top-level attribute.
        log_prob_sym2 = self.policy3.model.networks[
            'default'].dist.log_likelihood_sym(self.action_ph, self.dist3_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2(self.obs, [[1, 1]])
        assert log_prob1 == log_prob2

        log_prob_sym1 = self.policy2.distribution.log_likelihood_sym(
            self.action_ph, self.dist2_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func(self.obs, [[1, 1]])

        log_prob_sym2 = self.policy4.model.networks[
            'default'].dist.log_likelihood_sym(self.action_ph, self.dist4_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2(self.obs, [[1, 1]])
        assert log_prob1 == log_prob2

    def test_policy_entropy_sym(self):
        """Entropy of the P1 distribution equals that of P3."""
        entropy_sym1 = self.policy1.distribution.entropy_sym(
            self.dist1_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym1)
        entropy1 = entropy_func(self.obs)

        entropy_sym2 = self.policy3.distribution.entropy_sym(
            self.dist3_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym2)
        entropy2 = entropy_func(self.obs)
        assert entropy1 == entropy2

    def test_likelihood_ratio_sym(self):
        """Likelihood ratio P2/P1 equals P4/P3 for a fixed action."""
        likelihood_ratio_sym1 = self.policy1.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist1_sym,
            self.dist2_sym,
            name='li_ratio_sym1')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym1)
        likelihood_ratio1 = likelihood_ratio_func([[1, 1]], self.obs)

        likelihood_ratio_sym2 = self.policy3.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist3_sym,
            self.dist4_sym,
            name='li_ratio_sym2')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym2)
        likelihood_ratio2 = likelihood_ratio_func([[1, 1]], self.obs)

        assert likelihood_ratio1 == likelihood_ratio2