    def test_policy_gradient(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        obs = np.array([[1., 1., 1., 1.], [1., 1., 1., 1.]])

        # grad = -1/N sum_{i=1}^N dQ/da_i * da_i/dtheta
        #      = -1/2 sum_{i=1}^2 1 * [1,1,1,1]
        #      = -[1,1,1,1]
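        # A minimal NumPy sanity check of the arithmetic above, assuming the
        # "sum" critic gives dQ/da = 1 and the linear "sum" policy gives
        # da/dtheta = obs, so the gradient is -mean(obs, axis=0).
        np.testing.assert_allclose(-obs.mean(axis=0), -np.ones(4))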
        feed_dict = algo._policy_feed_dict(obs)
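        # Differentiate the surrogate loss w.r.t. the policy's internal
        # parameters and evaluate the gradient ops in the algorithm's session.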
        loss_grad_ops = tf.gradients(algo.policy_surrogate_loss,
                                     algo.policy.get_params_internal())
        actual_loss_grads = algo.sess.run(loss_grad_ops, feed_dict=feed_dict)
        actual_loss_grads_flat = np.vstack(actual_loss_grads).flatten()
        expected = [
            -1 * np.ones_like(v) for v in algo.policy.get_param_values()
        ]
        self.assertTrue(
            are_np_array_iterables_equal(actual_loss_grads_flat, expected))

    def test_policy_gradient2(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        obs = np.array([[1., -10., 1., 2.], [1., 100., 1., 2.]])

        # grad = -1/N sum_{i=1}^N dQ/da_i * da_i/dtheta
        #      = -1/2 * 1 * [1,-10,1,2]
        #        + -1/2 * 1 * [1,100,1,2]
        #      = -[1., 45., 1., 2.]
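        # A minimal NumPy sanity check of the arithmetic above, under the same
        # assumptions as in test_policy_gradient (dQ/da = 1, da/dtheta = obs):
        # the gradient is -mean(obs, axis=0) = [-1, -45, -1, -2].
        np.testing.assert_allclose(-obs.mean(axis=0),
                                   np.array([-1., -45., -1., -2.]))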
        feed_dict = algo._policy_feed_dict(obs)
        loss_grad_ops = tf.gradients(algo.policy_surrogate_loss,
                                     algo.policy.get_params_internal())
        actual_loss_grads = algo.sess.run(loss_grad_ops, feed_dict=feed_dict)
        expected = [np.array([[-1.], [-45.], [-1.], [-2.]])]
        self.assertTrue(
            are_np_array_iterables_equal(actual_loss_grads, expected))

    def test_policy_surrogate_loss2(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )

        obs = np.array([[0., 1., 1., -11.], [5., 10., 10., -10.]])

        # loss = -1/N sum_i Q(s_i, u(s_i))
        #      = -1/2 * {Q([0,1,1,-11], u([0,1,1,-11]))
        #                + Q([5,10,10,-10], u([5,10,10,-10]))}
        #      = -1/2 * {Q([0,1,1,-11], -9) + Q([5,10,10,-10], 15)}
        #      = -1/2 * (-18 + 30)
        #      = -6
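        # A minimal NumPy sanity check of the arithmetic above, assuming the
        # sum policy computes u(s) = sum(s) and the sum critic computes
        # Q(s, a) = sum(s) + a, so the loss is -mean over the batch.
        actions = obs.sum(axis=1)              # u(s) = [-9, 15]
        q_values = obs.sum(axis=1) + actions   # Q(s, u(s)) = [-18, 30]
        np.testing.assert_allclose(-q_values.mean(), -6.)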
        feed_dict = algo._policy_feed_dict(obs)
        actual = algo.sess.run(algo.policy_surrogate_loss, feed_dict=feed_dict)
        self.assertEqual(actual, -6.)
        self.assertIsInstance(actual, np.float32)

    def test_only_policy_values_change(self):
        discount = 0.5
        algo = DDPG(
            self.env,
            self.es,
            self.sum_policy,
            self.sum_critic,
            n_epochs=0,
            epoch_length=0,
            eval_samples=0,  # Ignore eval. Just do this to remove warnings.
            discount=discount,
        )
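        # A single policy training step should update only the policy's
        # parameters; the critic, its action-input copy, and both target
        # networks should be left untouched.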
        old_qf_values = algo.qf.get_param_values()
        old_qf_copy_values = algo.qf_with_action_input.get_param_values()
        old_policy_values = algo.policy.get_param_values()
        old_target_qf_values = algo.target_qf.get_param_values()
        old_target_policy_values = algo.target_policy.get_param_values()

        obs = np.array([[1., 1., 1., 1.]])
        feed_dict = algo._policy_feed_dict(obs)
        algo.sess.run(algo.train_policy_op, feed_dict=feed_dict)

        new_qf_values = algo.qf.get_param_values()
        new_qf_copy_values = algo.qf_with_action_input.get_param_values()
        new_policy_values = algo.policy.get_param_values()
        new_target_qf_values = algo.target_qf.get_param_values()
        new_target_policy_values = algo.target_policy.get_param_values()

        self.assertFalse(
            are_np_array_iterables_equal(old_policy_values, new_policy_values))
        self.assertTrue(
            are_np_array_iterables_equal(old_qf_values, new_qf_values))
        self.assertTrue(
            are_np_array_iterables_equal(old_qf_copy_values,
                                         new_qf_copy_values))
        self.assertTrue(
            are_np_array_iterables_equal(old_target_policy_values,
                                         new_target_policy_values))
        self.assertTrue(
            are_np_array_iterables_equal(old_target_qf_values,
                                         new_target_qf_values))