Example #1
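Constructor of a KL-regularized trajectory optimizer: both the policy update and the dynamics-parameter update are bounded in KL, and the nominal model is a fixed analytic LQG system wrapped in a near-deterministic matrix-normal distribution.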
    def __init__(self,
                 env,
                 nb_steps,
                 init_state,
                 init_action_sigma=1.,
                 policy_kl_bound=0.1,
                 param_kl_bound=100,
                 kl_stepwise=False,
                 activation=None,
                 slew_rate=False,
                 action_penalty=None):

        # logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_noise = self.env.unwrapped.sigma
        self.env_cost = self.env.unwrapped.cost
        self.env_init = init_state

        self.ulim = self.env.action_space.high

        self.dm_state = self.env.observation_space.shape[0]
        self.dm_act = self.env.action_space.shape[0]
        self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)

        self.nb_steps = nb_steps

        # use slew rate penalty or not
        self.env.unwrapped.slew_rate = slew_rate
        if action_penalty is not None:
            self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

        self.kl_stepwise = kl_stepwise
        if self.kl_stepwise:
            self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
            self.alpha = 1e8 * np.ones((self.nb_steps, ))
        else:
            self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
            self.alpha = 1e8 * np.ones((1, ))

        self.param_kl_bound = param_kl_bound
        self.beta = np.array([1e16])

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

        self.udist = Gaussian(self.dm_act, self.nb_steps)
        self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act,
                                               self.nb_steps)

        self.nominal = MatrixNormalParameters(self.dm_state, self.dm_act,
                                              self.nb_steps)

        # LQG dynamics
        # _A = np.array([[1.11627793, 0.00000000],
        #                [0.11107100, 1.10517083]])
        # _B = np.array([[0.10570721],
        #                [0.00536379]])
        # _c = np.array([-1.16277934, -2.16241837])

        _A = np.array([[9.99999500e-01, 9.99000500e-03],
                       [-9.99000500e-05, 9.98001499e-01]])
        _B = np.array([[4.99666792e-05], [9.99000500e-03]])
        _c = np.array([0., 0.])

        _params = np.hstack((_A, _B, _c[:, None]))
        for t in range(self.nb_steps):
            self.nominal.mu[..., t] = np.reshape(_params,
                                                 self.dm_param,
                                                 order='F')
            self.nominal.sigma[..., t] = 1e-8 * np.eye(self.dm_param)

        self.param = MatrixNormalParameters(self.dm_state, self.dm_act,
                                            self.nb_steps)

        # We assume process noise over dynamics is known
        self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))
        for t in range(self.nb_steps):
            self.noise[..., t] = self.env_noise

        self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                         self.nb_steps, init_action_sigma)

        # time-varying weighting of the cost: uniform, sigmoid-shaped, or discounted
        if activation is None:
            self.weighting = np.ones((self.nb_steps + 1, ))
        elif "mult" in activation and "shift" in activation:
            t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
            self.weighting = 1. / (1. + np.exp(-activation['mult'] *
                                               (t - activation['shift'])))
        elif "discount" in activation:
            self.weighting = np.ones((self.nb_steps + 1, ))
            gamma = activation["discount"] * np.ones((self.nb_steps, ))
            self.weighting[1:] = np.cumprod(gamma)
        else:
            raise NotImplementedError

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                            self.dm_act, self.nb_steps + 1)

        self.last_return = -np.inf

        self.data = {}
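Aside: the `activation` branch in this constructor (repeated verbatim in the later examples) builds a per-timestep weighting of the cost. A minimal standalone sketch of the three modes; `nb_steps` and the mode parameters below are illustrative assumptions, not values from the source:

import numpy as np

nb_steps = 10
t = np.linspace(0, nb_steps, nb_steps + 1)

# activation=None: uniform weighting over the horizon
uniform = np.ones((nb_steps + 1, ))

# {'mult': m, 'shift': s}: sigmoid ramp centered at step s with slope m
sigmoid = 1. / (1. + np.exp(-1.0 * (t - 5.0)))

# {'discount': g}: geometric decay g**t, with weight 1 at t=0
discounted = np.ones((nb_steps + 1, ))
discounted[1:] = np.cumprod(0.95 * np.ones((nb_steps, )))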
Example #2
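A variant of the constructor above: the nominal model is a `LearnedProbabilisticLinearDynamics` fitted from a `prior` instead of fixed analytic matrices, and the process noise is initialized to zero rather than read from the environment.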
    def __init__(self,
                 env,
                 nb_steps,
                 init_state,
                 init_action_sigma=1.,
                 policy_kl_bound=0.1,
                 param_kl_bound=100,
                 kl_stepwise=False,
                 activation=None,
                 slew_rate=False,
                 action_penalty=None,
                 prior=None):

        # logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_cost = self.env.unwrapped.cost
        self.env_init = init_state

        self.ulim = self.env.action_space.high

        self.dm_state = self.env.observation_space.shape[0]
        self.dm_act = self.env.action_space.shape[0]
        self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)

        self.nb_steps = nb_steps

        # use slew rate penalty or not
        self.env.unwrapped.slew_rate = slew_rate
        if action_penalty is not None:
            self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

        self.kl_stepwise = kl_stepwise
        if self.kl_stepwise:
            self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
            self.alpha = 1e8 * np.ones((self.nb_steps, ))
        else:
            self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
            self.alpha = 1e8 * np.ones((1, ))

        self.param_kl_bound = param_kl_bound
        self.beta = np.array([1e8])

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

        self.udist = Gaussian(self.dm_act, self.nb_steps)
        self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act,
                                               self.nb_steps)

        self.nominal = LearnedProbabilisticLinearDynamics(
            self.dm_state, self.dm_act, self.nb_steps, prior)

        self.param = MatrixNormalParameters(self.dm_state, self.dm_act,
                                            self.nb_steps)

        self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))

        self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                         self.nb_steps, init_action_sigma)

        # time-varying weighting of the cost: uniform, sigmoid-shaped, or discounted
        if activation is None:
            self.weighting = np.ones((self.nb_steps + 1, ))
        elif "mult" in activation and "shift" in activation:
            t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
            self.weighting = 1. / (1. + np.exp(-activation['mult'] *
                                               (t - activation['shift'])))
        elif "discount" in activation:
            self.weighting = np.ones((self.nb_steps + 1, ))
            gamma = activation["discount"] * np.ones((self.nb_steps, ))
            self.weighting[1:] = np.cumprod(gamma)
        else:
            raise NotImplementedError

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                            self.dm_act, self.nb_steps + 1)

        self.last_return = -np.inf

        self.data = {}
Example #3
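Propagates the state, action, and joint state-action distributions through the uncertain linear dynamics with a cubature rule. The work is delegated to an external `cubature_forward_pass` routine; a pure-Python version is kept in comments below.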
    def cubature_forward_pass(self, lgc, param):
        xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        udist = Gaussian(self.dm_act, self.nb_steps)
        xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        (xdist.mu, xdist.sigma,
         udist.mu, udist.sigma,
         xudist.mu, xudist.sigma) = cubature_forward_pass(self.xdist.mu[..., 0], self.xdist.sigma[..., 0],
                                                          param.mu, param.sigma, self.noise,
                                                          lgc.K, lgc.kff, lgc.sigma,
                                                          self.dm_state, self.dm_act, self.nb_steps)

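        # The commented block below is a pure-Python reference version of the
        # cubature forward pass, kept for documentation: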
        # dm_augmented = self.dm_state + self.dm_act + 1 + self.dm_state
        # chol_sigma_augmented = 1e-8 * np.eye(dm_augmented)
        #
        # input_cubature_points = np.zeros((dm_augmented, 2 * dm_augmented))
        # output_cubature_points = np.zeros((self.dm_state, 2 * dm_augmented))
        #
        # xdist.mu[..., 0] = self.xdist.mu[..., 0]
        # xdist.sigma[..., 0] = self.xdist.sigma[..., 0]
        # for t in range(self.nb_steps):
        #     At = param.mu[0: self.dm_state * self.dm_state, t]
        #     Bt = param.mu[self.dm_state * self.dm_state: self.dm_state * self.dm_state + self.dm_state * self.dm_act, t]
        #     ct = param.mu[self.dm_state * self.dm_state + self.dm_state * self.dm_act:
        #                   self.dm_state * self.dm_state + self.dm_state * self.dm_act + self.dm_state, t]
        #
        #     At = np.reshape(At, (self.dm_state, self.dm_state), order='F')
        #     Bt = np.reshape(Bt, (self.dm_state, self.dm_act), order='F')
        #     ct = np.reshape(ct, (self.dm_state, 1), order='F')
        #
        #     udist.mu[..., t] = lgc.K[..., t] @ xdist.mu[..., t] + lgc.kff[..., t]
        #     udist.sigma[..., t] = lgc.sigma[..., t] + lgc.K[..., t] @ xdist.sigma[..., t] @ lgc.K[..., t].T
        #     udist.sigma[..., t] = 0.5 * (udist.sigma[..., t] + udist.sigma[..., t].T)
        #     udist.sigma[..., t] += 1e-8 * np.eye(self.dm_act)
        #
        #     xudist.mu[..., t] = np.hstack((xdist.mu[..., t], udist.mu[..., t]))
        #     xudist.sigma[..., t] = np.block([[xdist.sigma[..., t], xdist.sigma[..., t] @ lgc.K[..., t].T],
        #                                      [lgc.K[..., t] @ xdist.sigma[..., t], udist.sigma[..., t]]])
        #     xudist.sigma[..., t] += 1e-8 * np.eye(self.dm_state + self.dm_act)
        #
        #     mu_augmented = np.hstack((xudist.mu[..., t], 1., np.zeros((self.dm_state, ))))
        #     chol_sigma_augmented[0: self.dm_state + self.dm_act,
        #                          0: self.dm_state + self.dm_act] = np.linalg.cholesky(xudist.sigma[..., t])
        #     chol_sigma_augmented[-self.dm_state:, -self.dm_state:] = np.eye(self.dm_state)
        #
        #     input_cubature_points = np.hstack((chol_sigma_augmented, - chol_sigma_augmented))
        #     input_cubature_points *= np.sqrt(dm_augmented)
        #     input_cubature_points += mu_augmented[:, None]
        #
        #     for k in range(2 * dm_augmented):
        #         vec_xu = input_cubature_points[:, k]
        #         mat_xu = np.kron(vec_xu[:self.dm_state + self.dm_act + 1], np.eye(self.dm_state))
        #         covar = self.noise[..., t] + mat_xu @ param.sigma[..., t] @ mat_xu.T
        #         covar = 0.5 * (covar + covar.T)
        #
        #         chol_covar = np.linalg.cholesky(covar)
        #         output_cubature_points[:, k] = np.hstack((At, Bt, ct, chol_covar)) @ vec_xu
        #
        #     xdist.mu[..., t + 1] = np.mean(output_cubature_points, axis=-1)
        #
        #     xdist.sigma[..., t + 1] = np.zeros((self.dm_state, self.dm_state))
        #     for k in range(2 * dm_augmented):
        #         diff = output_cubature_points[:, k] - xdist.mu[..., t + 1]
        #         xdist.sigma[..., t + 1] += diff[:, None] @ diff[:, None].T
        #
        #     xdist.sigma[..., t + 1] /= (2 * dm_augmented)
        #     xdist.sigma[..., t + 1] = 0.5 * (xdist.sigma[..., t + 1] + xdist.sigma[..., t + 1].T)
        #
        #     if t == self.nb_steps - 1:
        #         xudist.mu[..., t + 1] = np.hstack((xdist.mu[..., t + 1], np.zeros((self.dm_act, ))))
        #         xudist.sigma[:self.dm_state, :self.dm_state, t + 1] = xdist.sigma[..., t + 1]

        return xdist, udist, xudist
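The commented block above follows the standard third-degree cubature rule: a Gaussian with mean mu and covariance Sigma in n dimensions is represented by 2n points at mu +/- sqrt(n) times the columns of chol(Sigma), whose empirical mean and covariance recover mu and Sigma exactly. A self-contained sketch with toy values (`mu` and `sigma` below are illustrative assumptions):

import numpy as np

n = 3
mu = np.array([1., -2., 0.5])
sigma = np.diag([0.2, 0.5, 1.0])

# 2n cubature points: mu +/- sqrt(n) * columns of the Cholesky factor
chol = np.linalg.cholesky(sigma)
points = np.sqrt(n) * np.hstack((chol, -chol)) + mu[:, None]

# the empirical moments match mu and sigma up to floating point
mean_hat = np.mean(points, axis=-1)
diff = points - mean_hat[:, None]
cov_hat = diff @ diff.T / (2 * n)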
Example #4
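A constructor with two separate dynamics KL bounds (against the nominal model and against a regularizer); the nominal dynamics are obtained by linearizing the environment's differentiable dynamics around the origin with `autograd`.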
    def __init__(self,
                 env,
                 nb_steps,
                 init_state,
                 init_action_sigma=1.,
                 policy_kl_bound=0.1,
                 param_nominal_kl_bound=100.,
                 param_regularizer_kl_bound=1.,
                 policy_kl_stepwise=False,
                 activation=None,
                 slew_rate=False,
                 action_penalty=None,
                 nominal_variance=1e-8):

        # logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_noise = self.env.unwrapped.sigma
        self.env_cost = self.env.unwrapped.cost
        self.env_init = init_state

        self.ulim = self.env.action_space.high

        self.dm_state = self.env.observation_space.shape[0]
        self.dm_act = self.env.action_space.shape[0]
        self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)

        self.nb_steps = nb_steps

        # use slew rate penalty or not
        self.env.unwrapped.slew_rate = slew_rate
        if action_penalty is not None:
            self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

        self.policy_kl_stepwise = policy_kl_stepwise
        if self.policy_kl_stepwise:
            self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
            self.alpha = 1e8 * np.ones((self.nb_steps, ))
        else:
            self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
            self.alpha = 1e8 * np.ones((1, ))

        self.param_nominal_kl_bound = param_nominal_kl_bound * np.ones((1, ))
        self.beta = 1e16 * np.ones((1, ))

        self.param_regularizer_kl_bound = param_regularizer_kl_bound * np.ones(
            (1, ))
        self.eta = 1e16 * np.ones((1, ))

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

        self.udist = Gaussian(self.dm_act, self.nb_steps)
        self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act,
                                               self.nb_steps)

        # We assume process noise over dynamics is known
        self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))
        for t in range(self.nb_steps):
            self.noise[..., t] = self.env_noise

        self.param = MatrixNormalParameters(self.dm_state, self.dm_act,
                                            self.nb_steps)
        self.nominal = MatrixNormalParameters(self.dm_state, self.dm_act,
                                              self.nb_steps)

        # LQG dynamics: linearize the differentiable dynamics around the origin
        from autograd import jacobian

        args = (np.zeros((self.dm_state, )), np.zeros((self.dm_act, )))
        A = jacobian(self.env.unwrapped.dynamics, 0)(*args)
        B = jacobian(self.env.unwrapped.dynamics, 1)(*args)
        c = self.env.unwrapped.dynamics(*args)

        tmp = np.hstack((A, B, c[:, None]))
        for t in range(self.nb_steps):
            self.nominal.mu[..., t] = np.reshape(tmp, self.dm_param, order='F')
            self.nominal.sigma[..., t] = nominal_variance * np.eye(self.dm_param)

        self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                         self.nb_steps, init_action_sigma)

        # time-varying weighting of the cost: uniform, sigmoid-shaped, or discounted
        if activation is None:
            self.weighting = np.ones((self.nb_steps + 1, ))
        elif "mult" in activation and "shift" in activation:
            t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
            self.weighting = 1. / (1. + np.exp(-activation['mult'] *
                                               (t - activation['shift'])))
        elif "discount" in activation:
            self.weighting = np.ones((self.nb_steps + 1, ))
            gamma = activation["discount"] * np.ones((self.nb_steps, ))
            self.weighting[1:] = np.cumprod(gamma)
        else:
            raise NotImplementedError

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                            self.dm_act, self.nb_steps + 1)

        self.data = {}
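The linearization in this example uses `autograd.jacobian(fun, argnum)` to differentiate the dynamics with respect to the state (argnum 0) and the action (argnum 1) separately. A toy sketch of the same pattern; the `dynamics` function below is an illustrative assumption, not the environment's model:

import autograd.numpy as np
from autograd import jacobian

def dynamics(x, u):
    # toy discrete-time linear system: x' = x + dt * (A x + B u)
    A = np.array([[0., 1.], [-1., -0.1]])
    B = np.array([[0.], [1.]])
    return x + 0.01 * (A @ x + B @ u)

x0, u0 = np.zeros(2), np.zeros(1)
A_lin = jacobian(dynamics, 0)(x0, u0)  # df/dx at (x0, u0)
B_lin = jacobian(dynamics, 1)(x0, u0)  # df/du at (x0, u0)
c_lin = dynamics(x0, u0)               # drift at the linearization point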