def test_policy_gradient(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])

    # grad = -1/N sum_i dQ/da * da/dtheta
    #      = -1/2 * sum_{i=0}^{1} 1 * [1, 1, 1, 1]
    #      = -[1, 1, 1, 1]
    feed_dict = algo._policy_feed_dict(obs)
    loss_grad_ops = tf.gradients(algo.policy_surrogate_loss,
                                 algo.policy.get_params_internal())
    actual_loss_grads = algo.sess.run(loss_grad_ops, feed_dict=feed_dict)
    actual_loss_grads_flat = np.vstack(actual_loss_grads).flatten()
    expected = [
        -1 * np.ones_like(v) for v in algo.policy.get_param_values()
    ]
    self.assertTrue(
        are_np_array_iterables_equal(actual_loss_grads_flat, expected))
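
# The hand-computed gradient above relies on an assumption about the test
# fixtures: self.sum_policy presumably computes u(s) = w^T s with w
# initialized to all ones, and self.sum_critic computes Q(s, a) = sum(s) + a,
# so dQ/da = 1 and da/dw = s. Under that assumption the surrogate-loss
# gradient reduces to -mean(obs, axis=0). A NumPy-only sketch of that
# arithmetic (hypothetical helper, not part of the DDPG implementation):
def _sum_network_gradient_arithmetic_sketch(self):
    obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])
    # -1/N sum_i dQ/da * s_i = -mean(obs, axis=0) when dQ/da = 1.
    expected_grad = -obs.mean(axis=0)
    np.testing.assert_allclose(expected_grad, [-1., -1., -1., -1.])
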
def test_policy_gradient2(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    obs = np.array([[1., -10., 1., 2.], [1., 100., 1., 2.]])

    # grad = -1/N sum_i dQ/da * da/dtheta
    #      = -1/2 * 1 * [1, -10, 1, 2]
    #        + -1/2 * 1 * [1, 100, 1, 2]
    #      = -[1., 45., 1., 2.]
    feed_dict = algo._policy_feed_dict(obs)
    loss_grad_ops = tf.gradients(algo.policy_surrogate_loss,
                                 algo.policy.get_params_internal())
    actual_loss_grads = algo.sess.run(loss_grad_ops, feed_dict=feed_dict)
    expected = [np.array([[-1.], [-45.], [-1.], [-2.]])]
    self.assertTrue(
        are_np_array_iterables_equal(actual_loss_grads, expected))
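
# Same assumption as the sketch above (dQ/da = 1, da/dw = s): averaging the
# two per-sample gradients reproduces the expected [-1., -45., -1., -2.].
# Hypothetical NumPy-only check of that averaging step:
def _sum_network_gradient_arithmetic_sketch2(self):
    obs = np.array([[1., -10., 1., 2.], [1., 100., 1., 2.]])
    expected_grad = -obs.mean(axis=0)
    np.testing.assert_allclose(expected_grad, [-1., -45., -1., -2.])
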
def test_policy_surrogate_loss2(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    obs = np.array([[0., 1., 1., -11.], [5., 10., 10., -10.]])

    # loss = -1/N sum_i Q(s_i, u(s_i))
    #      = -1/2 * {Q([0,1,1,-11], u([0,1,1,-11]))
    #                + Q([5,10,10,-10], u([5,10,10,-10]))}
    #      = -1/2 * {Q([0,1,1,-11], -9) + Q([5,10,10,-10], 15)}
    #      = -1/2 * (-18 + 30)
    #      = -6
    feed_dict = algo._policy_feed_dict(obs)
    actual = algo.sess.run(algo.policy_surrogate_loss, feed_dict=feed_dict)
    self.assertEqual(actual, -6.)
    self.assertEqual(np.float32, type(actual))
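
# Under the same fixture assumption (u(s) = sum(s), Q(s, a) = sum(s) + a),
# Q(s, u(s)) collapses to 2 * sum(s), so the surrogate loss is
# -mean_i 2 * sum(s_i). Hypothetical NumPy-only check of the -6 above:
def _surrogate_loss_arithmetic_sketch(self):
    obs = np.array([[0., 1., 1., -11.], [5., 10., 10., -10.]])
    q_values = 2 * obs.sum(axis=1)  # [-18., 30.]
    self.assertEqual(-q_values.mean(), -6.)
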
def test_only_policy_values_change(self):
    discount = 0.5
    algo = DDPG(
        self.env,
        self.es,
        self.sum_policy,
        self.sum_critic,
        n_epochs=0,
        epoch_length=0,
        eval_samples=0,  # Ignore eval. Just do this to remove warnings.
        discount=discount,
    )
    old_qf_values = algo.qf.get_param_values()
    old_qf_copy_values = algo.qf_with_action_input.get_param_values()
    old_policy_values = algo.policy.get_param_values()
    old_target_qf_values = algo.target_qf.get_param_values()
    old_target_policy_values = algo.target_policy.get_param_values()

    obs = np.array([[1., 1., 1., 1.]])
    feed_dict = algo._policy_feed_dict(obs)
    algo.sess.run(algo.train_policy_op, feed_dict=feed_dict)

    new_qf_values = algo.qf.get_param_values()
    new_qf_copy_values = algo.qf_with_action_input.get_param_values()
    new_policy_values = algo.policy.get_param_values()
    new_target_qf_values = algo.target_qf.get_param_values()
    new_target_policy_values = algo.target_policy.get_param_values()

    self.assertFalse(
        are_np_array_iterables_equal(old_policy_values, new_policy_values))
    self.assertTrue(
        are_np_array_iterables_equal(old_qf_values, new_qf_values))
    self.assertTrue(
        are_np_array_iterables_equal(old_qf_copy_values, new_qf_copy_values))
    self.assertTrue(
        are_np_array_iterables_equal(old_target_policy_values,
                                     new_target_policy_values))
    self.assertTrue(
        are_np_array_iterables_equal(old_target_qf_values,
                                     new_target_qf_values))
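
# Note (assumption): this isolation presumably comes from building
# train_policy_op with the optimizer restricted to the policy's variables,
# e.g. something along the lines of (hypothetical sketch, not necessarily
# how DDPG constructs it):
#
#     train_policy_op = tf.train.AdamOptimizer(policy_learning_rate).minimize(
#         policy_surrogate_loss,
#         var_list=policy.get_params_internal(),
#     )
#
# With var_list limited to the policy parameters, a single run of the op can
# only change the policy, which is exactly what this test asserts.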