import torch

# `ParameterDecay`, `Constant`, and `Learnable` are assumed to come from the
# surrounding library's parameter utilities: `Constant` wraps a fixed scalar,
# while `Learnable(..., positive=True)` is a positive parameter that is
# optimized, used here as a Lagrange multiplier for the trust region.


def __init__(self, epsilon_mean=0.0, epsilon_var=0.0, regularization=False):
    super().__init__()
    if epsilon_var is None:
        self.separated_kl = False
        epsilon_var = 0.0
    else:
        self.separated_kl = True
    self.regularization = regularization

    if self.regularization:  # Regularization: \eta_mean KL_mean + \eta_var KL_var
        eta_mean = epsilon_mean
        eta_var = epsilon_var
        if not isinstance(eta_mean, ParameterDecay):
            eta_mean = Constant(eta_mean)
        if not isinstance(eta_var, ParameterDecay):
            eta_var = Constant(eta_var)
        self._eta_mean = eta_mean
        self._eta_var = eta_var
        self.epsilon_mean = torch.tensor(0.0)
        self.epsilon_var = torch.tensor(0.0)
    else:  # Trust-Region: || KL(q || \pi_old) || < \epsilon
        self._eta_mean = Learnable(1.0, positive=True)
        self._eta_var = Learnable(1.0, positive=True)
        self.epsilon_mean = torch.tensor(epsilon_mean)
        self.epsilon_var = torch.tensor(epsilon_var)
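# Illustrative sketch (not part of the class above, helper name hypothetical) of
# how the learnable temperatures and epsilon bounds set up in __init__ typically
# enter the loss for the trust region KL(q || \pi_old) < \epsilon when the KL is
# separated into a mean part and a (co)variance part.  `kl_mean` and `kl_var`
# are assumed tensors; `eta_mean` and `eta_var` are assumed callables returning
# the current positive multipliers, matching the attributes above.


def separated_kl_trust_region_loss(kl_mean, kl_var, eta_mean, eta_var, epsilon_mean, epsilon_var):
    """Return a loss whose gradient performs a primal-dual step on the separated KL constraint."""
    # Policy update: penalize the KL terms with the multipliers held fixed.
    primal = eta_mean().detach() * kl_mean + eta_var().detach() * kl_var
    # Multiplier update: grow eta when its KL exceeds the bound, shrink it otherwise.
    dual = eta_mean() * (epsilon_mean - kl_mean.detach()) + eta_var() * (
        epsilon_var - kl_var.detach()
    )
    return primal + dual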
def __init__(self, eta=0.0, target_entropy=0.0, regularization=True):
    super().__init__()
    # Actor
    self.target_entropy = target_entropy  # -self.policy.dim_action[0]
    self.regularization = regularization

    if regularization:  # Regularization: \eta KL(\pi || Uniform)
        if not isinstance(eta, ParameterDecay):
            eta = Constant(eta)
        self._eta = eta
    else:  # Trust-Region: || KL(\pi || Uniform) - target || < \epsilon
        if isinstance(eta, ParameterDecay):
            eta = eta()
        self._eta = Learnable(eta, positive=True)
def __init__(self, dual=0.0, inequality_zero=0.0, regularization=False):
    super().__init__()
    # Actor
    self.inequality_zero = inequality_zero
    self.regularization = regularization

    if regularization:  # Regularization: \dual g(x)
        if not isinstance(dual, ParameterDecay):
            dual = Constant(dual)
        self._dual = dual
    else:  # Constraint: g(x) < epsilon
        if isinstance(dual, ParameterDecay):
            dual = dual()
        self._dual = Learnable(dual, positive=True)
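# Self-contained sketch (hypothetical helper, not part of the class above) of how
# a positive learnable multiplier turns the constraint g(x) < epsilon into a
# differentiable loss: the primal player sees a fixed multiplier, the dual player
# sees a fixed constraint value, so one gradient step on the sum is a primal-dual step.

log_dual = torch.zeros(1, requires_grad=True)  # dual = exp(log_dual) > 0


def constrained_loss(objective, constraint, epsilon=0.0):
    """Combine an objective with a constraint g(x) < epsilon via a Lagrange multiplier."""
    dual = log_dual.exp()
    # Primal: penalize the constraint with the current (fixed) multiplier.
    primal_loss = objective + dual.detach() * (constraint - epsilon)
    # Dual: gradient descent on this term increases the multiplier when g(x) > epsilon.
    dual_loss = -dual * (constraint.detach() - epsilon)
    return primal_loss + dual_loss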
def __init__(self, epsilon=0.1, relent_regularization=False):
    super().__init__()
    if relent_regularization:
        eta = epsilon
        if not isinstance(eta, ParameterDecay):
            eta = Constant(eta)
        self._eta = eta
        self.epsilon = torch.tensor(0.0)
    else:  # Trust-Region: || KL(p || q) || < \epsilon
        self._eta = Learnable(1.0, positive=True)
        self.epsilon = torch.tensor(epsilon)
class REPS(AbstractAlgorithm):
    r"""Relative Entropy Policy Search Algorithm.

    REPS optimizes the following regularized LP over the set of distributions
    \mu(X, A).

    .. math:: \max_\mu \mu r - \eta R(\mu, d_0)
    .. math:: s.t. \sum_a \mu(x, a) = \sum_{x', a'} \mu(x', a') P(x|x', a'),

    where R is the relative entropy between \mu and a reference distribution d_0.
    This differs from the original formulation, in which R(\mu, d) is used to
    express a trust region.

    The dual of the LP is:

    .. math:: G(V) = \eta \log \sum_{x, a} d_0(x, a) \exp^{\delta(x, a) / \eta},

    where \delta(x, a) = r + \sum_{x'} P(x'|x, a) V(x') - V(x) is the TD-error
    and V(x) are the dual variables associated with the stationary constraints in
    the primal. V(x) is usually referred to as the value function.

    Using d(x, a) as the empirical distribution, G(V) can be approximated by
    samples.

    The optimal policy is given by:

    .. math:: \pi(a|x) \propto d_0(x, a) \exp^{\delta(x, a) / \eta}.

    Instead of setting the policy to \pi(a|x) at the sampled (x, a), we can fit
    the policy by minimizing the negative log-likelihood at the sampled elements.

    Calling REPS() returns a sample-based estimate of G(V) and the NLL of the
    policy. Both G(V) and the NLL are differentiable and lend themselves to
    gradient-based optimization.

    References
    ----------
    Peters, J., Mulling, K., & Altun, Y. (2010, July).
    Relative entropy policy search. AAAI.

    Deisenroth, M. P., Neumann, G., & Peters, J. (2013).
    A survey on policy search for robotics. Foundations and Trends® in Robotics.
    """

    def __init__(self, eta, entropy_regularization=False, learn_policy=True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.learn_policy = learn_policy

        if entropy_regularization:
            if not isinstance(eta, ParameterDecay):
                eta = Constant(eta)
            self.eta = eta
            self.epsilon = torch.tensor(0.0)
        else:
            self.eta = Learnable(1.0, positive=True)
            self.epsilon = torch.tensor(eta)

    def _policy_weighted_nll(self, state, action, weights):
        """Return weighted policy negative log-likelihood."""
        pi = tensor_to_distribution(self.policy(state), **self.policy.dist_params)
        _, action_log_p = get_entropy_and_log_p(pi, action, self.policy.action_scale)
        weighted_log_p = weights.detach() * action_log_p

        # Clamping is crucial for stability so that it does not converge to a delta.
        log_likelihood = torch.mean(weighted_log_p.clamp_max(1e-3))
        return -log_likelihood

    def get_value_target(self, observation):
        """Get value-function target."""
        next_v = self.critic(observation.next_state) * (1 - observation.done)
        return self.get_reward(observation) + self.gamma * next_v

    def actor_loss(self, observation):
        """Return primal and dual loss terms from REPS."""
        state, action, reward, next_state, done, *r = observation

        # Compute scaled TD-errors.
        value = self.critic(state)

        # For the dual function we need the full gradient, not the semi-gradient!
        target = self.get_value_target(observation)

        td = target - value
        weights = td / self.eta()
        normalizer = torch.logsumexp(weights, dim=0)
        dual = self.eta() * (self.epsilon + normalizer) + (1.0 - self.gamma) * value

        nll = self._policy_weighted_nll(state, action, weights)

        return Loss(dual_loss=dual.mean(), policy_loss=nll, td_error=td)

    def update(self):
        """Update regularization parameter."""
        super().update()
        self.eta.update()
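# The snippet below is an illustrative, self-contained sketch (not part of the
# class above, helper name hypothetical) of the sample-based dual G(V) described
# in the docstring.  It assumes a batch of TD-errors td = r + gamma * V(x') - V(x)
# has already been computed, and it omits the (1 - gamma) * V(x) term that
# actor_loss adds.


def reps_dual_and_weights(td, eta, epsilon):
    """Return a sample-based estimate of G(V) and the nonparametric policy weights."""
    log_weights = td / eta
    # Same estimator as actor_loss; the constant log(1/N) is dropped as it has no gradient.
    normalizer = torch.logsumexp(log_weights, dim=0)
    dual = eta * (epsilon + normalizer)
    # \pi(a|x) \propto d_0(x, a) \exp^{\delta(x, a) / \eta}: softmax normalizes the weights.
    weights = torch.softmax(log_weights, dim=0)
    return dual, weights


# Example usage with a batch of four TD-errors.
dual, weights = reps_dual_and_weights(torch.tensor([0.5, -0.2, 1.0, 0.1]), eta=0.5, epsilon=0.1)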