Example #1
    def __init__(self,
                 env,
                 nb_steps,
                 kl_bound,
                 init_ctl_sigma,
                 activation=range(-1, 0)):

        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_noise = self.env.unwrapped.noise
        self.env_cost = self.env.unwrapped.cost
        self.env_init = self.env.unwrapped.init

        self.ulim = self.env.action_space.high

        self.nb_xdim = self.env.observation_space.shape[0]
        self.nb_udim = self.env.action_space.shape[0]
        self.nb_steps = nb_steps

        # total kl over traj.
        self.kl_base = kl_bound
        self.kl_bound = kl_bound

        # kl mult.
        self.kl_mult = 1.
        self.kl_mult_min = 0.1
        self.kl_mult_max = 5.0

        self.alpha = np.array([-100.])

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.nb_xdim, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init()

        self.udist = Gaussian(self.nb_udim, self.nb_steps)
        self.xudist = Gaussian(self.nb_xdim + self.nb_udim, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.nb_xdim, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.nb_xdim, self.nb_udim,
                                               self.nb_steps)

        self.dyn = LearnedLinearGaussianDynamics(self.nb_xdim, self.nb_udim,
                                                 self.nb_steps)
        self.ctl = LinearGaussianControl(self.nb_xdim, self.nb_udim,
                                         self.nb_steps, init_ctl_sigma)

        # activation of cost function
        self.activation = np.zeros((self.nb_steps + 1, ), dtype=np.int64)
        self.activation[-1] = 1  # last step always active
        self.activation[activation] = 1

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.nb_xdim,
                                            self.nb_udim, self.nb_steps + 1)

        self.last_return = -np.inf

        self.data = {}
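
Every snippet below indexes a `Gaussian` helper as `mu[..., t]` and `sigma[..., t]`, i.e. with time along the last axis. The class itself is never shown; here is a minimal sketch of such a container, assuming zero means and identity covariances at construction (hypothetical; the class in the source repository may differ):

import numpy as np

class Gaussian:
    # Hypothetical time-indexed Gaussian container, shaped to match how
    # the snippets index it: mu[..., t] and sigma[..., t] at step t.
    def __init__(self, dim, nb_steps):
        self.dim, self.nb_steps = dim, nb_steps
        # means stacked along the last (time) axis: (dim, nb_steps)
        self.mu = np.zeros((dim, nb_steps))
        # covariances stacked the same way: (dim, dim, nb_steps)
        self.sigma = np.stack([np.eye(dim)] * nb_steps, axis=-1)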
Example #2
    def forward_pass(self, lgc):
        xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        udist = Gaussian(self.dm_act, self.nb_steps)
        xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        xdist.mu, xdist.sigma,\
        udist.mu, udist.sigma,\
        xudist.mu, xudist.sigma = forward_pass(self.xdist.mu[..., 0], self.xdist.sigma[..., 0],
                                               self.dyn.A, self.dyn.B, self.dyn.c, self.dyn.sigma,
                                               lgc.K, lgc.kff, lgc.sigma,
                                               self.dm_state, self.dm_act, self.nb_steps)
        return xdist, udist, xudist
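
The free function `forward_pass` called here is not shown. For time-varying linear-Gaussian dynamics x_{t+1} = A_t x_t + B_t u_t + c_t + w_t, w_t ~ N(0, sigma_dyn_t), under a controller u_t = K_t x_t + kff_t + e_t, e_t ~ N(0, sigma_ctl_t), the propagation it has to perform is standard. A minimal sketch under those assumptions (the real implementation also fills the joint xudist, which this sketch only forms internally):

import numpy as np

def lin_gauss_forward(mu0, sigma0, A, B, c, sigma_dyn, K, kff, sigma_ctl, nb_steps):
    # Sketch: propagate N(mu_x, sigma_x) through linear-Gaussian dynamics
    # under a linear-Gaussian controller; time runs along the last axis.
    dm_act, dm_state = K.shape[0], K.shape[1]

    mu_x = np.zeros((dm_state, nb_steps + 1))
    sigma_x = np.zeros((dm_state, dm_state, nb_steps + 1))
    mu_u = np.zeros((dm_act, nb_steps))
    sigma_u = np.zeros((dm_act, dm_act, nb_steps))

    mu_x[:, 0], sigma_x[..., 0] = mu0, sigma0
    for t in range(nb_steps):
        # action distribution induced by the controller
        mu_u[:, t] = K[..., t] @ mu_x[:, t] + kff[:, t]
        sigma_u[..., t] = K[..., t] @ sigma_x[..., t] @ K[..., t].T + sigma_ctl[..., t]

        # joint state-action covariance, then push through [A B]
        cross = sigma_x[..., t] @ K[..., t].T          # Cov(x_t, u_t)
        sigma_xu = np.block([[sigma_x[..., t], cross],
                             [cross.T, sigma_u[..., t]]])
        AB = np.hstack((A[..., t], B[..., t]))
        mu_x[:, t + 1] = A[..., t] @ mu_x[:, t] + B[..., t] @ mu_u[:, t] + c[:, t]
        sigma_x[..., t + 1] = AB @ sigma_xu @ AB.T + sigma_dyn[..., t]

    return mu_x, sigma_x, mu_u, sigma_u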
Example #3
    def __init__(self, env, nb_steps,
                 init_state, init_action_sigma=1.,
                 kl_bound=0.1, kl_adaptive=False):

        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_noise = self.env.unwrapped.noise
        self.env_cost = self.env.unwrapped.cost
        self.env_init = init_state

        self.ulim = self.env.action_space.high

        self.dm_state = self.env.observation_space.shape[0]
        self.dm_act = self.env.action_space.shape[0]
        self.nb_steps = nb_steps

        # total kl over traj.
        self.kl_base = kl_bound
        self.kl_bound = kl_bound

        # kl mult.
        self.kl_adaptive = kl_adaptive
        self.kl_mult = 1.
        self.kl_mult_min = 0.1
        self.kl_mult_max = 5.0

        self.alpha = np.array([-1e4])

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

        self.udist = Gaussian(self.dm_act, self.nb_steps)
        self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps)

        self.dyn = AnalyticalLinearGaussianDynamics(self.env_dyn, self.env_noise,
                                                    self.dm_state, self.dm_act, self.nb_steps)

        self.ctl = LinearGaussianControl(self.dm_state, self.dm_act, self.nb_steps, init_action_sigma)
        self.ctl.kff = 1e-2 * np.random.randn(self.dm_act, self.nb_steps)

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state, self.dm_act, self.nb_steps + 1)

        self.last_return = -np.inf
Example #4
    def forward_pass(self, lgc):
        """
        Forward pass on linearized system
        :param lgc:
        :return:
        """
        xdist = Gaussian(self.nb_xdim, self.nb_steps + 1)
        udist = Gaussian(self.nb_udim, self.nb_steps)
        xudist = Gaussian(self.nb_xdim + self.nb_udim, self.nb_steps + 1)

        xdist.mu, xdist.sigma,\
        udist.mu, udist.sigma,\
        xudist.mu, xudist.sigma = forward_pass(self.xdist.mu[..., 0], self.xdist.sigma[..., 0],
                                               self.dyn.A, self.dyn.B, self.dyn.c, self.dyn.sigma,
                                               lgc.K, lgc.kff, lgc.sigma,
                                               self.nb_xdim, self.nb_udim, self.nb_steps)
        return xdist, udist, xudist
Example #5
    def extended_kalman(self, lgc):
        """
        Forward pass on actual dynamics
        :param lgc:
        :return:
        """
        xdist = Gaussian(self.nb_xdim, self.nb_steps + 1)
        udist = Gaussian(self.nb_udim, self.nb_steps)
        cost = np.zeros((self.nb_steps + 1, ))

        xdist.mu[..., 0], xdist.sigma[..., 0] = self.dyn.evali()
        for t in range(self.nb_steps):
            udist.mu[..., t], udist.sigma[..., t] = lgc.forward(xdist, t)
            cost[..., t] = self.cost.evalf(xdist.mu[..., t], udist.mu[..., t],
                                           self.activation[t])
            xdist.mu[..., t + 1], xdist.sigma[..., t + 1] = self.dyn.forward(
                xdist, udist, lgc, t)

        cost[..., -1] = self.cost.evalf(xdist.mu[..., -1],
                                        np.zeros((self.nb_udim, )),
                                        self.activation[-1])
        return xdist, udist, cost
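
The `dyn.forward` used here is also not shown; on nonlinear dynamics it presumably performs an extended-Kalman-style propagation, pushing the mean through the dynamics and the covariance through a local Jacobian. A self-contained sketch of that single step, with the Jacobian approximated by central finite differences (hypothetical; the snippet's version would also account for the controller's feedback and the process noise model):

import numpy as np

def ekf_step(f, mu_x, sigma_x, mu_u, sigma_dyn, eps=1e-4):
    # One EKF-style step: mean through the nonlinear dynamics f(x, u),
    # covariance through a finite-difference Jacobian of f in x.
    dm_state = mu_x.shape[0]
    mu_next = f(mu_x, mu_u)

    A = np.zeros((dm_state, dm_state))
    for i in range(dm_state):
        dx = np.zeros(dm_state)
        dx[i] = eps
        A[:, i] = (f(mu_x + dx, mu_u) - f(mu_x - dx, mu_u)) / (2. * eps)

    sigma_next = A @ sigma_x @ A.T + sigma_dyn
    return mu_next, sigma_next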
Example #6
    def __init__(self,
                 env,
                 nb_steps,
                 kl_bound,
                 init_ctl_sigma,
                 activation=None):

        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_noise = self.env.unwrapped.noise
        self.env_cost = self.env.unwrapped.cost
        self.env_init = self.env.unwrapped.init

        self.ulim = self.env.action_space.high

        self.dm_state = self.env.observation_space.shape[0]
        self.dm_act = self.env.action_space.shape[0]
        self.nb_steps = nb_steps

        # total kl over traj.
        self.kl_base = kl_bound
        self.kl_bound = kl_bound

        # kl mult.
        self.kl_mult = 1.
        self.kl_mult_min = 0.1
        self.kl_mult_max = 5.0

        self.alpha = np.array([-1e4])

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init()

        self.udist = Gaussian(self.dm_act, self.nb_steps)
        self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act,
                                               self.nb_steps)

        self.dyn = LearnedLinearGaussianDynamics(self.dm_state, self.dm_act,
                                                 self.nb_steps)
        self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                         self.nb_steps, init_ctl_sigma)

        # sigmoid-shaped weighting of the cost function
        if activation is None:
            self.weighting = np.ones((self.nb_steps + 1, ))
        else:
            _t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
            self.weighting = 1. / (1. + np.exp(-activation['mult'] *
                                               (_t - activation['shift'])))

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                            self.dm_act, self.nb_steps + 1)

        self.last_return = -np.inf

        self.data = {}
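
For concreteness, the sigmoid weighting built in the else branch, evaluated with hypothetical mult and shift values:

import numpy as np

nb_steps = 10
activation = {'mult': 1., 'shift': 5.}          # hypothetical values
_t = np.linspace(0, nb_steps, nb_steps + 1)
weighting = 1. / (1. + np.exp(-activation['mult'] * (_t - activation['shift'])))
# ramps smoothly from ~0 at t = 0 to ~1 at t = nb_steps,
# crossing 0.5 at t = shift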
Example #7
    def __init__(self,
                 model,
                 reward,
                 horizon,
                 kl_bound,
                 u_lim,
                 init_ctl_sigma,
                 init_noise,
                 activation='last'):

        self.env = model

        self.env_dyn = self.env.dynamics
        self.env_noise = lambda x, u: self.env.sigV
        self.env_cost = reward
        self.env_init = self.env.init

        self.ulim = u_lim

        self.nb_xdim = self.env.dim_x
        self.nb_udim = self.env.dim_u
        self.nb_steps = horizon

        # total kl over traj.
        self.kl_base = kl_bound
        self.kl_bound = kl_bound

        # kl mult.
        self.kl_mult = 1.
        self.kl_mult_min = 0.1
        self.kl_mult_max = 5.0
        self.alpha = np.array([-100.])

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.nb_xdim, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init()

        self.udist = Gaussian(self.nb_udim, self.nb_steps)
        self.xudist = Gaussian(self.nb_xdim + self.nb_udim, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.nb_xdim, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.nb_xdim, self.nb_udim,
                                               self.nb_steps)

        self.dyn = AnalyticalLinearGaussianDynamics(self.env_init,
                                                    self.env_dyn,
                                                    self.env_noise,
                                                    self.nb_xdim, self.nb_udim,
                                                    self.nb_steps)
        self.ctl = LinearGaussianControl(self.nb_xdim, self.nb_udim,
                                         self.nb_steps, init_ctl_sigma)
        self.ctl.kff = init_noise * np.random.randn(self.nb_udim,
                                                    self.nb_steps)

        if activation == 'all':
            self.activation = np.ones((self.nb_steps + 1, ), dtype=np.int64)
        else:
            self.activation = np.zeros((self.nb_steps + 1, ), dtype=np.int64)
            self.activation[-1] = 1
        self.cost = AnalyticalQuadraticCost(self.env_cost, self.nb_xdim,
                                            self.nb_udim, self.nb_steps + 1)
Example #8
    def __init__(self,
                 env,
                 nb_steps,
                 init_state,
                 init_action_sigma=1.,
                 kl_bound=0.1,
                 kl_adaptive=False,
                 kl_stepwise=False,
                 activation=None,
                 slew_rate=False,
                 action_penalty=None):

        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_noise = self.env.unwrapped.noise
        self.env_cost = self.env.unwrapped.cost
        self.env_init = init_state

        self.ulim = self.env.action_space.high

        self.dm_state = self.env.observation_space.shape[0]
        self.dm_act = self.env.action_space.shape[0]
        self.nb_steps = nb_steps

        # use slew rate penalty or not
        self.env.unwrapped.slew_rate = slew_rate
        if action_penalty is not None:
            self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

        self.kl_stepwise = kl_stepwise
        if self.kl_stepwise:
            self.kl_base = kl_bound * np.ones((self.nb_steps, ))
            self.kl_bound = kl_bound * np.ones((self.nb_steps, ))
            self.alpha = 1e8 * np.ones((self.nb_steps, ))
        else:
            self.kl_base = kl_bound * np.ones((1, ))
            self.kl_bound = kl_bound * np.ones((1, ))
            self.alpha = 1e8 * np.ones((1, ))

        # kl mult.
        self.kl_adaptive = kl_adaptive
        self.kl_mult = 1.
        self.kl_mult_min = 0.1
        self.kl_mult_max = 5.0

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

        self.udist = Gaussian(self.dm_act, self.nb_steps)
        self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act,
                                               self.nb_steps)

        self.dyn = AnalyticalLinearGaussianDynamics(self.env_dyn,
                                                    self.env_noise,
                                                    self.dm_state, self.dm_act,
                                                    self.nb_steps)

        self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                         self.nb_steps, init_action_sigma)
        self.ctl.kff = 1e-4 * np.random.randn(self.dm_act, self.nb_steps)

        # weighting of the cost function: uniform, sigmoid-shaped, or discounted
        if activation is None:
            self.weighting = np.ones((self.nb_steps + 1, ))
        elif "mult" and "shift" in activation:
            t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
            self.weighting = 1. / (1. + np.exp(-activation['mult'] *
                                               (t - activation['shift'])))
        elif "discount" in activation:
            self.weighting = np.ones((self.nb_steps + 1, ))
            gamma = activation["discount"] * np.ones((self.nb_steps, ))
            self.weighting[1:] = np.cumprod(gamma)
        else:
            raise NotImplementedError

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                            self.dm_act, self.nb_steps + 1)

        self.last_return = -np.inf
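
The discount branch above reduces to plain geometric discounting: the cumulative product of a constant gamma over t steps is gamma**t. A quick check:

import numpy as np

nb_steps, gamma = 5, 0.9
weighting = np.ones((nb_steps + 1, ))
weighting[1:] = np.cumprod(gamma * np.ones((nb_steps, )))
# weighting -> [1., 0.9, 0.81, 0.729, 0.6561, 0.59049], i.e. gamma**t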