Example #1
 def setUp(self):
     super().setUp()
     self.env = gym.make('MountainCarContinuous-v0')
     self.agent = DDPG(hidden_sizes=(4, ))
     self.obs_dim = 2
     self.act_dim = 1
     self.batch_size = 6
     self.obs_ph = tf_utils.tfph(self.obs_dim, 'obs')
     self.act_ph = tf_utils.tfph(self.act_dim, 'act')
     self.is_term_ph = tf_utils.tfph(None, 'is_term')
     self.rew_ph = tf_utils.tfph(None, 'rew')
     self.placeholders = {
         'obs': self.obs_ph,
         'act': self.act_ph,
         'next_obs': self.obs_ph,
         'is_term': self.is_term_ph,
         'rew': self.rew_ph
     }
     self.obs = np.random.randn(self.batch_size, self.obs_dim)
     self.act = np.random.randn(self.batch_size, self.act_dim)
     self.is_term = np.random.randint(0, 2,
                                      self.batch_size).astype(np.float32)
     self.rew = np.random.randn(self.batch_size)
     self.feed_dict = {
         self.obs_ph: self.obs,
         self.act_ph: self.act,
         self.is_term_ph: self.is_term,
         self.rew_ph: self.rew
     }
 def test_build_policy_loss(self):
     """ make sure that when we train the loss, logp of actions with high
     advantage go up, and vice versa. Also check that kl-div goes up as we
     update the policy """
     learning_rate = 1e-3
     logp_old_ph = tfph(None)
     logp_old = np.log(np.random.rand(self.batch_size)).astype(np.float32)
     adv_ph = tfph(None)
     adv = np.random.randn(self.batch_size).astype(np.float32)
     logp = tf.get_variable('logp', dtype=tf.float32, trainable=True,
                            initializer=logp_old)
     placeholders = {'logp': logp_old_ph, 'adv': adv_ph}
     pi_loss, pi_train_op = self.ppo.build_policy_loss(
         logp, placeholders, learning_rate)
     feed_dict = {logp_old_ph: logp_old, adv_ph: adv}
     with self.cached_session() as sess:
         sess.run(tf.global_variables_initializer())
         init_loss, init_kl = sess.run((pi_loss, self.ppo.kl_divergence),
                                       feed_dict=feed_dict)
         self.assertAlmostEqual(init_loss, -np.mean(adv), places=5)
         # since the new and old policies are the same before training, the
         # kl divergence should be zero
         self.assertAlmostEqual(init_kl, 0)
         sess.run(pi_train_op, feed_dict=feed_dict)
         after_loss, after_kl = sess.run((pi_loss, self.ppo.kl_divergence),
                                          feed_dict=feed_dict)
         # ensure the loss went down
         self.assertLess(after_loss, init_loss)
         delta_logp = sess.run(logp) - logp_old
         # ensure that logp goes up if adv > 0 and vice versa
         np.testing.assert_array_equal(np.sign(delta_logp),
                                       np.sign(adv))
         # ensure that kl_div changed
         self.assertNotEqual(after_kl, init_kl)
 def test_build_clipped_surrogate(self):
     """ smoke test build_clipped_surrogate """
     sur_ph = tfph(None)
     max_sur_ph = tfph(None)
     sur = np.random.randn(self.batch_size)
     max_sur = np.random.randn(self.batch_size)
     expected = np.array([min(sur_i, max_sur_i) for (sur_i, max_sur_i)
                          in zip(sur, max_sur)])
     with self.cached_session() as sess:
         ret = sess.run(
             self.ppo.build_clipped_surrogate(sur_ph, max_sur_ph),
             feed_dict={sur_ph: sur, max_sur_ph: max_sur})
     np.testing.assert_allclose(ret, expected)
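
The PPO methods exercised above (build_policy_loss, build_clipped_surrogate, and the build_max_surrogate helper tested in Example #5 below) are not part of this listing. The following is only a minimal sketch consistent with the assertions in these tests: a loss of -mean(adv) at initialization, a KL divergence that starts at zero, and elementwise clipping of the surrogate. The clip_ratio default, the sample-based KL estimate, and writing the helpers as plain functions (in the tests they are methods on the PPO agent) are all assumptions.

import tensorflow as tf

def build_max_surrogate(clip_ratio, adv):
    """ Upper bound on the surrogate: (1 + clip_ratio)*adv for positive
    advantages, (1 - clip_ratio)*adv for negative ones. """
    return (1.0 + clip_ratio * tf.sign(adv)) * adv

def build_clipped_surrogate(surrogate, max_surrogate):
    """ Elementwise minimum of the surrogate and its clipped upper bound. """
    return tf.minimum(surrogate, max_surrogate)

def build_policy_loss(self, logp, placeholders, learning_rate, clip_ratio=0.2):
    """ PPO-clip objective: negative mean of the clipped surrogate. Also
    stores a sample-based KL estimate, which is zero before the first update
    because logp is initialized to the old log-probabilities. """
    adv, logp_old = placeholders['adv'], placeholders['logp']
    ratio = tf.exp(logp - logp_old)  # new-to-old probability ratio
    surrogate = ratio * adv
    clipped = build_clipped_surrogate(
        surrogate, build_max_surrogate(clip_ratio, adv))
    pi_loss = -tf.reduce_mean(clipped)
    self.kl_divergence = tf.reduce_mean(logp_old - logp)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(pi_loss)
    return pi_loss, train_op
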
Example #4
 def create_placeholders(self, obs_space, act_space):
     """ Build the placeholders required for this agent """
     self.placeholders['obs'] = tfph(obs_space.shape[-1], name='obs')
     if isinstance(act_space, Box):
         self.placeholders['act'] = tfph(act_space.shape[-1], name='act')
     elif isinstance(act_space, Discrete):
         self.placeholders['act'] = tf.placeholder(dtype=tf.int64,
                                                   shape=[None],
                                                   name='act')
     else:
         raise NotImplementedError(
             'action space {} not implemented'.format(act_space))
     for name in ('ret', 'adv'):
         self.placeholders[name] = tfph(None, name=name)
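
A hypothetical usage of this method, just to show the two action-space branches; Agent is a stand-in for whichever agent on this page mixes in create_placeholders and owns a placeholders dict, and CartPole is chosen only because it has a Discrete action space.

import gym
import tensorflow as tf

agent = Agent()  # hypothetical agent exposing create_placeholders above
env = gym.make('CartPole-v1')  # Discrete(2) action space
agent.create_placeholders(env.observation_space, env.action_space)
# the Discrete branch creates an int64 'act' placeholder of shape [None]
assert agent.placeholders['act'].dtype == tf.int64
assert agent.placeholders['act'].shape.as_list() == [None]
# 'ret' and 'adv' placeholders are created regardless of the space type
assert {'obs', 'act', 'ret', 'adv'} <= set(agent.placeholders)
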
Example #5
 def setUp(self):
     super().setUp()
     self.act_dim = 3
     self.batch_size = 12
     self.obs_dim = 5
     self.obs_ph = tf_utils.tfph(self.obs_dim)
     self.obs = np.random.randn(self.batch_size, self.obs_dim)
 def test_tfph_smoke(self):
     """ smoke test tfph """
     x_dim = 3
     x = np.random.rand(8, x_dim)
     x_ph = tf_utils.tfph(x_dim)
     with self.cached_session() as sess:
         ret = sess.run(x_ph, feed_dict={x_ph: x})
         np.testing.assert_almost_equal(x, ret)
 def test_tfph_None(self):
     """ test tfph when size is None"""
     x_dim = None
     x = np.random.rand(8)
     x_ph = tf_utils.tfph(None, name='x')
     self.assertTrue(x_ph.name.startswith('x'))
     with self.cached_session() as sess:
         ret = sess.run(x_ph, feed_dict={x_ph: x})
         np.testing.assert_almost_equal(x, ret)
 def test_build_max_surrogate(self):
     """ smoke test build_max_surrogate """
     clip_ratio = 0.2
     adv_ph = tfph(None)
     adv = np.random.randn(self.batch_size)
     expected = (1 + clip_ratio*np.sign(adv))*adv
     with self.cached_session() as sess:
         ret = sess.run(self.ppo.build_max_surrogate(clip_ratio, adv_ph),
                        feed_dict={adv_ph: adv})
     self.assertEqual(ret.shape, (self.batch_size,))
     np.testing.assert_allclose(expected, ret)
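
tf_utils.tfph itself never appears in this listing. Judging from how it is used in these tests (a float placeholder with a leading batch dimension, an optional feature size, and an optional name), a minimal implementation could be:

import tensorflow as tf

def tfph(size, name=None):
    """ Convenience wrapper around tf.placeholder: float dtype, a leading
    batch dimension of None, and an optional trailing feature dimension. """
    shape = [None] if size is None else [None, size]
    return tf.placeholder(dtype=tf.float32, shape=shape, name=name)
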
Example #9
 def test_build_qval_target(self):
     """ test building targets for qval loss """
     qval_pi_targ_ph = tf_utils.tfph(None, 'qval_pi_targ')
     qval_pi_targ_np = np.random.randn(self.batch_size)
     expected = self.rew \
         + (1 - self.is_term)*self.agent.gamma*qval_pi_targ_np
     target = self.agent.build_qval_target(qval_pi_targ_ph,
                                           self.placeholders)
     feed_dict = {qval_pi_targ_ph: qval_pi_targ_np, **self.feed_dict}
     with self.cached_session() as sess:
         ret = sess.run(target, feed_dict=feed_dict)
     np.testing.assert_allclose(expected, ret)
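
build_qval_target is not shown on this page; given the expected values computed in this test it is the usual one-step Bellman backup. The sketch below assumes gamma lives on the agent and that the target is wrapped in stop_gradient.

import tensorflow as tf

def build_qval_target(self, qval_pi_targ, placeholders):
    """ One-step Bellman target for the Q-value:
    rew + (1 - is_term) * gamma * Q_targ(next_obs, pi_targ(next_obs)). """
    rew, is_term = placeholders['rew'], placeholders['is_term']
    target = rew + (1.0 - is_term) * self.gamma * qval_pi_targ
    # keep the target fixed when differentiating the Q-value loss
    return tf.stop_gradient(target)
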
Example #10
 def test_log_prob_of_action(self):
     """ smoke test log_prob_of_action """
     log_probs_ph = tf_utils.tfph(self.n_cat)
     log_probs = np.random.rand(self.batch_size, self.n_cat)
     actions_ph = tf.placeholder(dtype=tf.int64, shape=[None])
     actions = np.random.randint(0, self.n_cat, self.batch_size, np.int64)
     with self.cached_session() as sess:
         ret = sess.run(categorical.log_prob_of_action(
             log_probs_ph, actions_ph),
                        feed_dict={
                            log_probs_ph: log_probs,
                            actions_ph: actions
                        })
     for ind in range(self.batch_size):
         self.assertAlmostEqual(ret[ind], log_probs[ind, actions[ind]])
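
categorical.log_prob_of_action is also not part of the listing; a sketch that satisfies the elementwise check above selects, for each row, the log-probability at the index of the taken action, for example via a one-hot mask:

import tensorflow as tf

def log_prob_of_action(log_probs, actions):
    """ Pick log_probs[i, actions[i]] for every row i of a (batch, n_cat)
    tensor of per-category log-probabilities. """
    n_cat = tf.shape(log_probs)[-1]
    mask = tf.one_hot(actions, depth=n_cat, dtype=log_probs.dtype)
    return tf.reduce_sum(mask * log_probs, axis=-1)
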
 def test_build_value_function_smoke(self):
     """ check the num of trainable params and output shape make sense for
     build_value_func """
     obs_dim = self.env.observation_space.shape[0]
     obs_ph = tf_utils.tfph(obs_dim)
     obs = np.random.rand(8, obs_dim)
     val = self.vpg.build_value_function(obs_ph,
                                         hidden_sizes=(4, ),
                                         activation=None)
     n_params = (obs_dim + 1) * 4 + (4 + 1) * 1
     with self.cached_session() as sess:
         ret_n_params = tf_utils.trainable_count(scope='val')
         sess.run(tf.global_variables_initializer())
         sess_val = sess.run(val, feed_dict={obs_ph: obs})
     self.assertEqual(n_params, ret_n_params)
     self.assertEqual(sess_val.shape, (8, ))
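
The parameter count in this test, (obs_dim + 1) * 4 + (4 + 1) * 1, corresponds to one hidden layer of 4 units followed by a single linear output unit, built under a 'val' variable scope. A sketch of build_value_function along those lines (the choice of tf.layers.dense is an assumption):

import tensorflow as tf

def build_value_function(self, obs, hidden_sizes, activation):
    """ MLP state-value function V(s) with one scalar output per
    observation, squeezed from (batch, 1) to (batch,). """
    with tf.variable_scope('val'):
        out = obs
        for size in hidden_sizes:
            out = tf.layers.dense(out, units=size, activation=activation)
        return tf.squeeze(tf.layers.dense(out, units=1), axis=-1)
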
 def create_placeholders(self, obs_space, act_space):
     """ create placeholders """
     if not isinstance(act_space, Box):
         raise NotImplementedError(
             "action space {} not implemented. ".format(type(act_space)) +
             "NOTE: DDPG is only compatible with continuous action spaces")
     act_dim, obs_dim = act_space.shape[-1], obs_space.shape[-1]
     ph_shapes = {
         'act': act_dim,
         'obs': obs_dim,
         'next_obs': obs_dim,
         'is_term': None,
         'rew': None
     }
     self.placeholders = {
         name: tf_utils.tfph(shape, name=name)
         for name, shape in ph_shapes.items()
     }
Example #13
 def test_mlp_categorical_policy(self):
     """ smoke test mlp_categorical_policy """
     action_space = Mock()
     action_space.n = self.n_cat
     obs_ph = tf_utils.tfph(2)
     obs = np.random.rand(self.batch_size, 2)
     act_ph = tf.placeholder(dtype=tf.int64, shape=[None])
     act = np.random.randint(0, self.n_cat, self.batch_size, np.int64)
     with self.cached_session() as sess:
         ret_symbol = categorical.mlp_categorical_policy(
             obs_ph,
             act_ph,
             hidden_sizes=(4, ),
             activation=tf.tanh,
             action_space=action_space)
         sess.run(tf.global_variables_initializer())
         ret = sess.run(ret_symbol, feed_dict={obs_ph: obs, act_ph: act})
     for val in ret:
         self.assertEqual(val.shape, (self.batch_size, ))
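
mlp_categorical_policy is only smoke-tested above. From the mock's .n attribute and the (batch_size,)-shaped outputs, it plausibly builds logits over the discrete actions and returns a sampled action together with log-probabilities; the exact return tuple below is an inference, and it reuses the log_prob_of_action sketch shown under Example #10.

import tensorflow as tf

def mlp_categorical_policy(obs, act, hidden_sizes, activation, action_space):
    """ MLP policy over a discrete action space. Returns a sampled action,
    the log-prob of the supplied action, and the log-prob of the sampled
    action, each of shape (batch,). """
    out = obs
    for size in hidden_sizes:
        out = tf.layers.dense(out, units=size, activation=activation)
    logits = tf.layers.dense(out, units=action_space.n)
    log_probs = tf.nn.log_softmax(logits)
    pi = tf.squeeze(tf.multinomial(logits, num_samples=1), axis=1)
    logp = log_prob_of_action(log_probs, act)
    logp_pi = log_prob_of_action(log_probs, pi)
    return pi, logp, logp_pi
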
 def test_build_val_loss_smoke(self):
     """ Make sure the loss goes down when training, and that training
     brings val closer to rets """
     batch_size = 4
     ret_ph = tf_utils.tfph(None)
     ret = np.ones(batch_size)
     val = tf.get_variable('val',
                           dtype=tf.float32,
                           trainable=True,
                           initializer=4 * [0.])
     loss, train_op = self.vpg.build_val_loss(val, ret_ph, 1e-3)
     with self.cached_session() as sess:
         sess.run(tf.global_variables_initializer())
         old_loss = sess.run(loss, feed_dict={ret_ph: ret})
         sess.run(train_op, feed_dict={ret_ph: ret})
         new_loss = sess.run(loss, feed_dict={ret_ph: ret})
         new_val = sess.run(val)
     self.assertEqual(new_loss.shape, tuple())
     self.assertLess(new_loss, old_loss)
     self.assertTrue(all(new_val > 0))
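
build_val_loss is likewise not shown; the assertions (a scalar loss that decreases, with val moving toward the returns) are consistent with a plain mean-squared-error loss, for example:

import tensorflow as tf

def build_val_loss(self, val, ret_ph, learning_rate):
    """ Mean-squared error between predicted values and empirical returns,
    minimized with Adam (the optimizer choice is an assumption). """
    loss = tf.reduce_mean(tf.square(val - ret_ph))
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return loss, train_op
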
 def test_build_policy_loss_smoke(self):
     """ Make sure the loss goes down when training, and that training
     changes logp in the expected direction """
     batch_size = 4
     adv_ph = tfph(None)
     adv = np.ones(batch_size)
     logp = tf.get_variable('logp',
                            dtype=tf.float32,
                            trainable=True,
                            initializer=batch_size * [0.])
     loss, train_op = self.vpg.build_policy_loss(logp, {'adv': adv_ph},
                                                 learning_rate=1e-3)
     with self.cached_session() as sess:
         sess.run(tf.global_variables_initializer())
         old_loss = sess.run(loss, feed_dict={adv_ph: adv})
         sess.run(train_op, feed_dict={adv_ph: adv})
         new_loss = sess.run(loss, feed_dict={adv_ph: adv})
         new_logp = sess.run(logp)
     self.assertEqual(new_loss.shape, tuple())
     self.assertLess(new_loss, old_loss)
     self.assertTrue(all(new_logp > 0))
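
build_policy_loss for this vanilla-policy-gradient agent is not shown either; a sketch consistent with the test (a scalar loss that decreases and pushes logp in the direction of the advantages) is the standard -E[logp * adv] objective:

import tensorflow as tf

def build_policy_loss(self, logp, placeholders, learning_rate):
    """ Vanilla policy-gradient surrogate: the negative mean of
    logp(a|s) * advantage, minimized with Adam (assumed optimizer). """
    loss = -tf.reduce_mean(logp * placeholders['adv'])
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return loss, train_op
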
Example #16
 def setUp(self):
     super().setUp()
     self.n_cat = 5
     self.batch_size = 12
     self.logits_ph = tf_utils.tfph(self.n_cat)
     self.logits = np.random.rand(self.batch_size, self.n_cat)
Example #17
 def create_placeholders(self, obs_space, act_space):
     """ we need logp for the training the policy loss, so we'll add a
     placeholder for it here """
     super().create_placeholders(obs_space, act_space)
     self.placeholders['logp'] = tfph(None, name='logp')