Example #1
    def forward(self, obs: to.Tensor) -> to.Tensor:
        """
        Control step of the energy-based controller, which is used inside the swing-up controller

        :param obs: observations pre-processed in the `forward` method of `QQubeSwingUpAndBalanceCtrl`
        :return: action
        """
        # Reconstruct partial state
        th, al, thd, ald = obs

        # Compute energies
        J_pole = self._domain_param["mass_pend_pole"] * self._domain_param[
            "length_pend_pole"]**2 / 12.0
        E_kin = 0.5 * J_pole * ald**2
        E_pot = (0.5 * self._domain_param["mass_pend_pole"] *
                 self._domain_param["gravity_const"] *
                 self._domain_param["length_pend_pole"] * (1.0 - to.cos(al)))
        E = E_kin + E_pot

        # Compute clipped action
        u = self.E_gain * (E - self.E_ref) * to.sign(
            ald * to.cos(al)) - self._th_gain * th
        acc = clamp_symm(u, self.acc_max)
        trq = self._domain_param["mass_rot_pole"] * self._domain_param[
            "length_rot_pole"] * acc
        volt = self._domain_param["motor_resistance"] / self._domain_param[
            "motor_back_emf"] * trq
        return volt.view(1)
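The snippet above calls `clamp_symm` to keep the acceleration command within symmetric bounds. A minimal stand-in for that helper, assuming it simply clips element-wise to the interval [-bound, bound] (this sketch is ours, not the library's implementation):

import torch as to

def clamp_symm(x: to.Tensor, bound: float) -> to.Tensor:
    # Clip every element of x to the symmetric interval [-bound, bound]
    return to.clamp(x, -bound, bound)

print(clamp_symm(to.tensor([-7.0, 0.3, 5.0]), 4.0))  # tensor([-4.0000, 0.3000, 4.0000])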
Example #2
    def forward(self, obs: to.Tensor) -> to.Tensor:
        """
        Calculate the controller output.

        :param obs: observation from the environment
        :return: controller output [V]
        """
        x, sin_th, cos_th, x_dot, theta_dot = obs
        theta = to.atan2(sin_th, cos_th)
        alpha = (theta - math.pi) if theta > 0 else (theta + math.pi)

        J_pole = self.dp_nom["pole_length"]**2 * self.dp_nom["pole_mass"] / 3.0
        J_eq = (
            self.dp_nom["cart_mass"] +
            (self.dp_nom["gear_efficiency"] * self.dp_nom["gear_ratio"]**2 *
             self.dp_nom["motor_inertia"]) / self.dp_nom["pinion_radius"]**2)

        # Energy terms: E_pot(0) = 0; E_pot(pi) = E_pot(-pi) = 2 mgl
        E_kin = J_pole / 2.0 * theta_dot**2
        E_pot = self.dp_nom["pole_mass"] * self.dp_nom[
            "gravity_const"] * self.dp_nom["pole_length"] * (1 - cos_th)
        E_ref = 2.0 * self.dp_nom["pole_mass"] * self.dp_nom[
            "gravity_const"] * self.dp_nom["pole_length"]

        if to.abs(alpha) < 0.1745 or self.pd_control:
            # Stabilize at the top
            self.pd_activated = True
            u = self.K_pd.dot(to.tensor([x, alpha, x_dot, theta_dot]))
        else:
            # Swing up
            u = self.k_e * (E_kin + E_pot - E_ref) * to.sign(
                theta_dot * cos_th) + self.k_p * (0.0 - x)
            u = clamp_symm(u, self.u_max)

            if self.pd_activated:
                self.pd_activated = False

        act = (J_eq * self.dp_nom["motor_resistance"] *
               self.dp_nom["pinion_radius"] * u) / (
                   self.dp_nom["gear_efficiency"] * self.dp_nom["gear_ratio"] *
                   self.dp_nom["motor_efficiency"] *
                   self.dp_nom["motor_back_emf"]
               ) + self.dp_nom["gear_ratio"] * self.dp_nom[
                   "motor_back_emf"] * x_dot / self.dp_nom["pinion_radius"]

        # Return the action as a 1-dim tensor, so that a later conversion to NumPy
        # does not collapse it to a plain Python float
        return act.view(1)
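The line `alpha = (theta - math.pi) if theta > 0 else (theta + math.pi)` above shifts the angle reference from the hanging-down position (theta = 0) to the upright position (alpha = 0). A small self-contained check of that mapping; the helper name `wrap_to_upright` is made up for illustration:

import math
import torch as to

def wrap_to_upright(theta: to.Tensor) -> to.Tensor:
    # theta = 0 means the pole hangs down; alpha = 0 means the pole is upright
    return to.where(theta > 0, theta - math.pi, theta + math.pi)

# A pole slightly short of upright from either side gives a small alpha (~[-0.05, 0.05])
print(wrap_to_upright(to.tensor([math.pi - 0.05, -math.pi + 0.05])))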
Example #3
    def forward(self, obs: to.Tensor) -> to.Tensor:
        """
        Control step of the energy-based controller, which is used inside the swing-up controller

        :param obs: observations pre-processed in the `forward` method of `QQubeSwingUpAndBalanceCtrl`
        :return: action
        """
        # Reconstruct partial state
        th, al, thd, ald = obs

        # Compute energies
        J_pole = self.dp_nom['Mp']*self.dp_nom['Lp']**2/12.
        E_kin = 0.5*J_pole*ald**2
        E_pot = 0.5*self.dp_nom['Mp']*self.dp_nom['g']*self.dp_nom['Lp']*(1. - to.cos(al))
        E = E_kin + E_pot

        # Compute clipped action
        u = self.E_gain*(E - self.E_ref)*to.sign(ald*to.cos(al)) - self._th_gain*th
        acc = clamp_symm(u, self.acc_max)
        trq = self.dp_nom['Mr']*self.dp_nom['Lr']*acc
        volt = self.dp_nom['Rm']/self.dp_nom['km']*trq
        return volt.unsqueeze(0)
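The last two lines convert the clipped acceleration into a torque and then into a motor voltage via the static DC-motor relation u = R_m / k_m * tau. A tiny numeric sketch of that conversion; the values below are placeholders, not the library's nominal domain parameters:

import torch as to

Mr, Lr = 0.095, 0.085     # rotary-arm mass [kg] and length [m] (placeholder values)
Rm, km = 8.4, 0.042       # motor resistance [Ohm] and back-EMF constant [V*s/rad] (placeholders)

acc = to.tensor(3.0)      # clipped acceleration command
trq = Mr * Lr * acc       # torque at the rotary arm
volt = Rm / km * trq      # commanded motor voltage
print(volt.unsqueeze(0))  # 1-dim tensor, analogous to the return value above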
Example #4
    def update(self,
               param_results: ParameterSamplingResult,
               ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results[1:].mean_returns

        # Rank policy parameters by return (a.k.a. fitness)
        rets = rank_transform(
            rets_avg_ros) if self.transform_returns else rets_avg_ros

        # Move to PyTorch
        rets = to.from_numpy(rets).to(to.get_default_dtype())
        rets_max = to.max(rets)
        rets_avg_symm = (rets[:len(param_results) // 2] +
                         rets[len(param_results) // 2:]) / 2.
        baseline = to.mean(rets)  # zero if centered

        # Compute finite differences for the average return of each solution
        rets_fds = rets[:len(param_results) // 2] - rets[len(param_results) //
                                                         2:]

        # Get the perturbations (select the first half since they are symmetric)
        epsilon = param_results.parameters[:len(param_results) //
                                           2, :] - self._policy.param_values

        if self.normalize_update:
            # See equation (15, top) in [1]
            delta_mean = (rets_fds / (2 * rets_max - rets_fds + 1e-6)
                          ) @ epsilon  # epsilon = T from [1]
        else:
            # See equation (13) in [1]
            delta_mean = 0.5 * rets_fds @ epsilon  # epsilon = T from [1]

        # Update the mean
        self.optim.zero_grad()
        self._policy.param_grad = -delta_mean  # PyTorch optimizers are minimizers
        self.optim.step()
        # Old version without PyTorch optimizer: self._expl_strat.policy.param_values += delta_mean * self.lr

        # Update the std
        S = (epsilon**2 - self._expl_strat.std**2) / self._expl_strat.std

        if self.normalize_update:
            # See equation (15, bottom) in [1]
            delta_std = (rets_avg_symm - baseline) @ S
        else:
            # See equation (14) in [1]
            delta_std = ((rets_avg_symm - baseline) /
                         (rets_max - baseline + 1e-6)) @ S

        # Bound the change on the exploration standard deviation (i.e. the entropy)
        delta_std *= self.lr
        delta_std = clamp_symm(delta_std,
                               self.clip_ratio_std * self._expl_strat.std)
        new_std = self._expl_strat.std + delta_std

        self._expl_strat.adapt(std=new_std)

        # Logging
        self.logger.add_value('policy param', self._policy.param_values, 4)
        self.logger.add_value('delta policy param', delta_mean * self.lr, 4)
        self.logger.add_value('expl strat std', self._expl_strat.std, 4)
        self.logger.add_value('expl strat entropy',
                              self._expl_strat.get_entropy(), 4)
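For reference, the core of the mean update is the symmetric finite-difference estimate `0.5 * rets_fds @ epsilon` from equation (13). A self-contained sketch with made-up shapes and random returns (none of the names below come from the algorithm class):

import torch as to

num_pairs, num_params = 4, 3
epsilon = to.randn(num_pairs, num_params)   # perturbations added to the current policy parameters
rets_pos = to.randn(num_pairs)              # average returns of the +epsilon candidates
rets_neg = to.randn(num_pairs)              # average returns of the -epsilon candidates

rets_fds = rets_pos - rets_neg              # finite differences, one per symmetric pair
delta_mean = 0.5 * rets_fds @ epsilon       # un-normalized update direction, cf. equation (13)
print(delta_mean.shape)                     # torch.Size([3]): one entry per policy parameter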