Example #1
def _kl(params):
    """Compute the KL divergence between the old policy and the policy defined by `params`."""
    # self, batch_size, time_steps and action_distribution_parameters come from
    # the enclosing method's scope; this is a nested helper.
    # Write the flat parameter vector into the candidate policy's variables.
    unflatten_tensor(params, self._opt_policy_parameters)
    opt_policy_state = self._opt_policy.get_initial_state(batch_size)
    dists = self._opt_policy.distribution(time_steps, opt_policy_state)
    policy_distribution = dists.action
    kl = self._kl_divergence(time_steps,
                             action_distribution_parameters,
                             policy_distribution)
    return tf.reduce_mean(kl)
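
Note: the examples on this page use the helpers flatten_tensors and unflatten_tensor without showing their definitions. As a rough, assumed sketch (not the library's actual implementation): flatten_tensors concatenates a list of tensors into one flat 1-D vector, and unflatten_tensor writes consecutive slices of such a vector back into a list of variables with matching shapes.

import tensorflow as tf

def flatten_tensors(tensor_list):
    # Concatenate every tensor, flattened to 1-D, into a single vector.
    return tf.concat([tf.reshape(t, [-1]) for t in tensor_list], axis=0)

def unflatten_tensor(flat, variables):
    # Assign consecutive slices of the flat vector back into each variable,
    # restoring its original shape, then return the updated variables.
    offset = 0
    for var in variables:
        size = int(tf.size(var))
        var.assign(tf.reshape(flat[offset:offset + size], var.shape))
        offset += size
    return variables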
Example #2
    def _line_search(self, time_steps, policy_steps_, advantages,
                     natural_gradient, coeff, weights):
        """Find new policy parameters by line search in natural gradient direction"""

        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]

        # old policy distribution
        action_distribution_parameters = policy_steps_.info
        actions = policy_steps_.action
        actions_distribution = distribution_spec.nested_distributions_from_specs(
            self._action_distribution_spec,
            action_distribution_parameters["dist_params"])
        act_log_probs = common.log_probability(actions_distribution, actions,
                                               self._action_spec)

        # loss for the old policy
        loss_threshold = self.policy_gradient_loss(
            time_steps,
            actions,
            tf.stop_gradient(act_log_probs),
            tf.stop_gradient(advantages),
            actions_distribution,
            weights,
        )

        policy_params = flatten_tensors(self._actor_net.trainable_variables)

        # Try different step sizes; accept the first one that improves the loss and satisfies the KL constraint.
        for it in range(self._backtrack_iters):
            new_params = policy_params - self._backtrack_coeff**it * coeff * natural_gradient

            unflatten_tensor(new_params, self._opt_policy_parameters)
            opt_policy_state = self._opt_policy.get_initial_state(batch_size)
            dists = self._opt_policy.distribution(time_steps, opt_policy_state)
            new_policy_distribution = dists.action

            kl = tf.reduce_mean(
                self._kl_divergence(time_steps, action_distribution_parameters,
                                    new_policy_distribution))
            loss = self.policy_gradient_loss(
                time_steps,
                actions,
                tf.stop_gradient(act_log_probs),
                tf.stop_gradient(advantages),
                new_policy_distribution,
                weights,
            )
            if kl < self._max_kl and loss < loss_threshold:
                return new_params

        # no improvement found
        return policy_params
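
The loop above backtracks geometrically: each candidate is policy_params minus backtrack_coeff**it * coeff * natural_gradient, so the step along the natural gradient shrinks by a factor of backtrack_coeff on every retry. A tiny illustration with made-up values:

backtrack_coeff, coeff = 0.8, 1.0  # illustrative values, not the agent's defaults
scales = [backtrack_coeff**it * coeff for it in range(3)]
print(scales)  # approximately [1.0, 0.8, 0.64]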
Example #3
def test_flatten_unflatten(tensor_list):
    """test unflattened tensorlist matches tensorlist before flattening it"""
    variables = [tf.Variable(np.zeros_like(t.numpy())) for t in tensor_list]
    flat = flatten_tensors(tensor_list)
    unflattened = unflatten_tensor(flat, variables)
    for before, after in zip(tensor_list, unflattened):
        np.testing.assert_array_almost_equal(before.numpy(), after.numpy())
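
In the test suite this check is presumably driven by a property-based framework that supplies tensor_list; a quick manual invocation, assuming the helper sketch above, would look like:

import numpy as np
import tensorflow as tf

tensors = [tf.constant(np.arange(6, dtype=np.float32).reshape(2, 3)),
           tf.constant(np.ones(4, dtype=np.float32))]
test_flatten_unflatten(tensors)  # passes silently if the round trip reproduces every tensor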
Example #4
def test_unflatten_size(shape_list, vector):
    """ Test unflatten returns list of variables with correct shapes"""
    var_tensors = [
        tf.Variable(np.zeros(shape), dtype=np.float32) for shape in shape_list
    ]
    unflattened = unflatten_tensor(vector, var_tensors)
    for t, s in zip(unflattened, shape_list):
        assert tuple(tf.shape(t)) == tuple(s)
Example #5
    def _update_policy(self, time_steps, policy_steps_, advantages, weights):
        """Update policy parameters by computing natural gradient and step_size"""

        policy_gradient_loss, policy_grad = self.policy_gradient(
            time_steps, policy_steps_, advantages, weights)

        natural_gradient, coeff = self.natural_policy_gradient(
            time_steps, policy_steps_, policy_grad, weights)

        # Find the best step size in the natural gradient direction.
        new_params = self._line_search(time_steps, policy_steps_, advantages,
                                       natural_gradient, coeff, weights)

        tf.debugging.check_numerics(new_params,
                                    "Updated policy parameters",
                                    name="new_params_check")
        unflatten_tensor(new_params, self._actor_net.trainable_variables)

        return policy_gradient_loss