def _kl(params):
    """Compute KL between old policy and policy using given params."""
    unflatten_tensor(params, self._opt_policy_parameters)
    opt_policy_state = self._opt_policy.get_initial_state(batch_size)
    dists = self._opt_policy.distribution(time_steps, opt_policy_state)
    policy_distribution = dists.action
    kl = self._kl_divergence(time_steps, action_distribution_parameters,
                             policy_distribution)
    return tf.reduce_mean(kl)

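# Why `_kl` is exposed as a function of the flat parameter vector: the natural
# gradient step needs Fisher-vector products, which TRPO-style implementations
# typically obtain as Hessian-vector products of the KL via two nested gradient
# tapes. The sketch below is a self-contained illustration of that pattern, not
# this agent's code; `hessian_vector_product` and `toy_kl` are hypothetical names.
import tensorflow as tf

def hessian_vector_product(f, params, vector):
    """Computes H_f(params) @ vector by double backprop."""
    with tf.GradientTape() as outer_tape:
        outer_tape.watch(params)
        with tf.GradientTape() as inner_tape:
            inner_tape.watch(params)
            value = f(params)
        grad = inner_tape.gradient(value, params)
        # Dot the gradient with the fixed vector, then differentiate again.
        grad_vector_dot = tf.reduce_sum(grad * vector)
    return outer_tape.gradient(grad_vector_dot, params)

# Toy stand-in for `_kl`: 0.5 * ||params||^2 has an identity Hessian, so the
# Hessian-vector product should return `vector` unchanged.
toy_kl = lambda p: 0.5 * tf.reduce_sum(tf.square(p))
print(hessian_vector_product(toy_kl,
                             tf.constant([0.1, 0.2, 0.3]),
                             tf.constant([1.0, 2.0, 3.0])))
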
def _line_search(self, time_steps, policy_steps_, advantages, natural_gradient,
                 coeff, weights):
    """Find new policy parameters by line search in the natural gradient direction."""
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]

    # Old policy distribution.
    action_distribution_parameters = policy_steps_.info
    actions = policy_steps_.action
    actions_distribution = distribution_spec.nested_distributions_from_specs(
        self._action_distribution_spec,
        action_distribution_parameters["dist_params"])
    act_log_probs = common.log_probability(actions_distribution, actions,
                                           self._action_spec)

    # Loss for the old policy.
    loss_threshold = self.policy_gradient_loss(
        time_steps,
        actions,
        tf.stop_gradient(act_log_probs),
        tf.stop_gradient(advantages),
        actions_distribution,
        weights,
    )

    policy_params = flatten_tensors(self._actor_net.trainable_variables)

    # Try different step sizes; accept the first one that improves the loss
    # and satisfies the KL constraint.
    for it in range(self._backtrack_iters):
        new_params = (policy_params -
                      self._backtrack_coeff**it * coeff * natural_gradient)

        unflatten_tensor(new_params, self._opt_policy_parameters)
        opt_policy_state = self._opt_policy.get_initial_state(batch_size)
        dists = self._opt_policy.distribution(time_steps, opt_policy_state)
        new_policy_distribution = dists.action

        kl = tf.reduce_mean(
            self._kl_divergence(time_steps, action_distribution_parameters,
                                new_policy_distribution))
        loss = self.policy_gradient_loss(
            time_steps,
            actions,
            tf.stop_gradient(act_log_probs),
            tf.stop_gradient(advantages),
            new_policy_distribution,
            weights,
        )
        if kl < self._max_kl and loss < loss_threshold:
            return new_params

    # No improvement found within the backtracking budget; keep the old parameters.
    return policy_params

def test_flatten_unflatten(tensor_list):
    """Test that the unflattened tensor list matches the tensor list before flattening."""
    variables = [tf.Variable(np.zeros_like(t.numpy())) for t in tensor_list]
    flat = flatten_tensors(tensor_list)
    unflattened = unflatten_tensor(flat, variables)
    for before, after in zip(tensor_list, unflattened):
        np.testing.assert_array_almost_equal(before.numpy(), after.numpy())

def test_unflatten_size(shape_list, vector):
    """Test that unflatten returns a list of variables with the correct shapes."""
    var_tensors = [
        tf.Variable(np.zeros(shape), dtype=np.float32) for shape in shape_list
    ]
    unflattened = unflatten_tensor(vector, var_tensors)
    for t, s in zip(unflattened, shape_list):
        assert t.shape.as_list() == list(s)

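# The tests above exercise two helpers that are not shown in this section. Below
# is a minimal sketch of how they might be implemented, under the usual
# convention: `flatten_tensors` concatenates everything into one 1-D vector, and
# `unflatten_tensor` splits such a vector back and assigns it into the given
# variables in place. Hypothetical implementation, shown only to make the tests
# self-explanatory.
import numpy as np
import tensorflow as tf

def flatten_tensors(tensors):
    """Concatenates a list of tensors into a single 1-D tensor."""
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)

def unflatten_tensor(vector, variables):
    """Splits `vector` by each variable's size and assigns the slices in place."""
    sizes = [int(np.prod(v.shape.as_list())) for v in variables]
    splits = tf.split(vector, sizes)
    for var, flat in zip(variables, splits):
        var.assign(tf.reshape(flat, var.shape))
    return variables
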
def _update_policy(self, time_steps, policy_steps_, advantages, weights):
    """Update policy parameters by computing the natural gradient and step size."""
    policy_gradient_loss, policy_grad = self.policy_gradient(
        time_steps, policy_steps_, advantages, weights)
    natural_gradient, coeff = self.natural_policy_gradient(
        time_steps, policy_steps_, policy_grad, weights)

    # Find the best step size in the natural gradient direction.
    new_params = self._line_search(time_steps, policy_steps_, advantages,
                                   natural_gradient, coeff, weights)
    tf.debugging.check_numerics(new_params, "Updated policy parameters",
                                name="new_params_check")

    # Write the accepted flat parameter vector back into the actor network.
    unflatten_tensor(new_params, self._actor_net.trainable_variables)
    return policy_gradient_loss

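# `natural_policy_gradient` above returns both the natural gradient direction and
# an initial step-size coefficient `coeff`. In standard TRPO this coefficient is
# derived from the quadratic approximation of the KL constraint: the largest step
# along s = H^{-1} g with 0.5 * s^T H s <= max_kl has magnitude
# sqrt(2 * max_kl / (s^T H s)). A hedged sketch of that derivation, as an
# assumption about this codebase; `fisher_vector_product` is a hypothetical
# callback that applies H to a vector.
import tensorflow as tf

def initial_step_coeff(natural_gradient, fisher_vector_product, max_kl, eps=1e-8):
    """Largest step size whose quadratic KL estimate stays within max_kl."""
    shs = tf.reduce_sum(natural_gradient * fisher_vector_product(natural_gradient))
    return tf.sqrt(2.0 * max_kl / (shs + eps))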