Example #1
    def policy_backward_pass(self, alpha, agcost):
        lgc = LinearGaussianControl(self.dm_state, self.dm_act, self.nb_steps)
        xvalue = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        xuvalue = QuadraticStateActionValue(self.dm_state, self.dm_act, self.nb_steps)

        xuvalue.Qxx, xuvalue.Qux, xuvalue.Quu,\
        xuvalue.qx, xuvalue.qu, xuvalue.q0, \
        xvalue.V, xvalue.v, xvalue.v0, \
        lgc.K, lgc.kff, lgc.sigma, diverge = policy_backward_pass(agcost.Cxx, agcost.cx, agcost.Cuu,
                                                                  agcost.cu, agcost.Cxu, agcost.c0,
                                                                  self.param.mu, self.param.sigma, self.noise,
                                                                  alpha, self.dm_state, self.dm_act, self.nb_steps)
        return lgc, xvalue, xuvalue, diverge
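
The wrapper above allocates the controller and value-function containers and delegates the actual recursion to the module-level policy_backward_pass routine. A minimal usage sketch, assuming a solver instance plus precomputed alpha and agcost (the names and the retry heuristic are illustrative, not part of the original code):

# Hypothetical usage: rerun the backward pass with a larger dual variable
# if the recursion diverges.
lgc, xvalue, xuvalue, diverge = solver.policy_backward_pass(alpha, agcost)
if diverge:
    alpha = alpha * 10.  # assumed heuristic: tighten the KL penalty and retry
    lgc, xvalue, xuvalue, diverge = solver.policy_backward_pass(alpha, agcost)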
Example #2
    def parameter_backward_pass(self, beta, agcost, xdist):
        param = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)
        xvalue = QuadraticStateValue(self.dm_state, self.nb_steps + 1)

        xvalue.V, xvalue.v, xvalue.v0,\
        param.mu, param.sigma, diverge = parameter_backward_pass(xdist.mu, xdist.sigma,
                                                                 self.ctl.K, self.ctl.kff, self.ctl.sigma, self.noise,
                                                                 self.cost.cx, self.cost.Cxx, self.cost.Cuu,
                                                                 self.cost.cu, self.cost.Cxu, self.cost.c0,
                                                                 agcost.Cxx, agcost.cx, agcost.c0,
                                                                 beta, self.dm_state, self.dm_act, self.dm_param,
                                                                 self.nb_steps)
        return param, xvalue, diverge
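
The parameter-level counterpart returns an updated matrix-normal distribution over the dynamics parameters, a quadratic state value function, and a divergence flag. A corresponding hedged sketch, assuming the same solver instance together with beta, agcost, and the current state distribution (again illustrative names only):

# Hypothetical usage of the parameter backward pass.
param, xvalue, diverge = solver.parameter_backward_pass(beta, agcost, solver.xdist)
if diverge:
    beta = beta * 10.  # assumed fallback: increase the dual variable and retry
    param, xvalue, diverge = solver.parameter_backward_pass(beta, agcost, solver.xdist)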
Example #3
    def __init__(self,
                 env,
                 nb_steps,
                 init_state,
                 init_action_sigma=1.,
                 policy_kl_bound=0.1,
                 param_kl_bound=100,
                 kl_stepwise=False,
                 activation=None,
                 slew_rate=False,
                 action_penalty=None):

        # logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_noise = self.env.unwrapped.sigma
        self.env_cost = self.env.unwrapped.cost
        self.env_init = init_state

        self.ulim = self.env.action_space.high

        self.dm_state = self.env.observation_space.shape[0]
        self.dm_act = self.env.action_space.shape[0]
        self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)

        self.nb_steps = nb_steps

        # use slew rate penalty or not
        self.env.unwrapped.slew_rate = slew_rate
        if action_penalty is not None:
            self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

        self.kl_stepwise = kl_stepwise
        if self.kl_stepwise:
            self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
            self.alpha = 1e8 * np.ones((self.nb_steps, ))
        else:
            self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
            self.alpha = 1e8 * np.ones((1, ))

        self.param_kl_bound = param_kl_bound
        self.beta = np.array([1e16])

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

        self.udist = Gaussian(self.dm_act, self.nb_steps)
        self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act,
                                               self.nb_steps)

        self.nominal = MatrixNormalParameters(self.dm_state, self.dm_act,
                                              self.nb_steps)

        # LQG dynamics
        # _A = np.array([[1.11627793, 0.00000000],
        #                [0.11107100, 1.10517083]])
        # _B = np.array([[0.10570721],
        #                [0.00536379]])
        # _c = np.array([-1.16277934, -2.16241837])

        _A = np.array([[9.99999500e-01, 9.99000500e-03],
                       [-9.99000500e-05, 9.98001499e-01]])
        _B = np.array([[4.99666792e-05], [9.99000500e-03]])
        _c = np.array([0., 0.])

        _params = np.hstack((_A, _B, _c[:, None]))
        for t in range(self.nb_steps):
            self.nominal.mu[..., t] = np.reshape(_params,
                                                 self.dm_param,
                                                 order='F')
            self.nominal.sigma[..., t] = 1e-8 * np.eye(self.dm_param)

        self.param = MatrixNormalParameters(self.dm_state, self.dm_act,
                                            self.nb_steps)

        # We assume process noise over dynamics is known
        self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))
        for t in range(self.nb_steps):
            self.noise[..., t] = self.env_noise

        self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                         self.nb_steps, init_action_sigma)

        # cost weighting activation: sigmoid-shaped ramp or geometric discount
        if activation is None:
            self.weighting = np.ones((self.nb_steps + 1, ))
        elif "mult" and "shift" in activation:
            t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
            self.weighting = 1. / (1. + np.exp(-activation['mult'] *
                                               (t - activation['shift'])))
        elif "discount" in activation:
            self.weighting = np.ones((self.nb_steps + 1, ))
            gamma = activation["discount"] * np.ones((self.nb_steps, ))
            self.weighting[1:] = np.cumprod(gamma)
        else:
            raise NotImplementedError

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                            self.dm_act, self.nb_steps + 1)

        self.last_return = -np.inf

        self.data = {}
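
In this constructor the activation argument controls how the per-step cost weighting is built: None yields uniform weights, a dict with mult and shift yields a sigmoid ramp, and a dict with discount yields a geometric discount. The standalone NumPy sketch below reproduces only that weighting logic so it can be inspected in isolation; the helper name cost_weighting is made up for illustration:

import numpy as np

def cost_weighting(nb_steps, activation=None):
    # Mirrors the constructor's weighting: uniform, sigmoid ramp, or discount.
    if activation is None:
        return np.ones((nb_steps + 1, ))
    elif "mult" in activation and "shift" in activation:
        t = np.linspace(0, nb_steps, nb_steps + 1)
        return 1. / (1. + np.exp(-activation["mult"] * (t - activation["shift"])))
    elif "discount" in activation:
        weighting = np.ones((nb_steps + 1, ))
        weighting[1:] = np.cumprod(activation["discount"] * np.ones((nb_steps, )))
        return weighting
    else:
        raise NotImplementedError

# Example: sigmoid ramp centered at step 50 of a 100-step horizon.
w = cost_weighting(100, activation={"mult": 0.5, "shift": 50})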
Example #4
    def __init__(self,
                 env,
                 nb_steps,
                 init_state,
                 init_action_sigma=1.,
                 policy_kl_bound=0.1,
                 param_kl_bound=100,
                 kl_stepwise=False,
                 activation=None,
                 slew_rate=False,
                 action_penalty=None,
                 prior=None):

        # logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_cost = self.env.unwrapped.cost
        self.env_init = init_state

        self.ulim = self.env.action_space.high

        self.dm_state = self.env.observation_space.shape[0]
        self.dm_act = self.env.action_space.shape[0]
        self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)

        self.nb_steps = nb_steps

        # use slew rate penalty or not
        self.env.unwrapped.slew_rate = slew_rate
        if action_penalty is not None:
            self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

        self.kl_stepwise = kl_stepwise
        if self.kl_stepwise:
            self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
            self.alpha = 1e8 * np.ones((self.nb_steps, ))
        else:
            self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
            self.alpha = 1e8 * np.ones((1, ))

        self.param_kl_bound = param_kl_bound
        self.beta = np.array([1e8])

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

        self.udist = Gaussian(self.dm_act, self.nb_steps)
        self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act,
                                               self.nb_steps)

        self.nominal = LearnedProbabilisticLinearDynamics(
            self.dm_state, self.dm_act, self.nb_steps, prior)

        self.param = MatrixNormalParameters(self.dm_state, self.dm_act,
                                            self.nb_steps)

        self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))

        self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                         self.nb_steps, init_action_sigma)

        # cost weighting activation: sigmoid-shaped ramp or geometric discount
        if activation is None:
            self.weighting = np.ones((self.nb_steps + 1, ))
        elif "mult" and "shift" in activation:
            t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
            self.weighting = 1. / (1. + np.exp(-activation['mult'] *
                                               (t - activation['shift'])))
        elif "discount" in activation:
            self.weighting = np.ones((self.nb_steps + 1, ))
            gamma = activation["discount"] * np.ones((self.nb_steps, ))
            self.weighting[1:] = np.cumprod(gamma)
        else:
            raise NotImplementedError

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                            self.dm_act, self.nb_steps + 1)

        self.last_return = -np.inf

        self.data = {}
Example #5
    def __init__(self,
                 env,
                 nb_steps,
                 init_state,
                 init_action_sigma=1.,
                 policy_kl_bound=0.1,
                 param_nominal_kl_bound=100.,
                 param_regularizer_kl_bound=1.,
                 policy_kl_stepwise=False,
                 activation=None,
                 slew_rate=False,
                 action_penalty=None,
                 nominal_variance=1e-8):

        # logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

        self.env = env

        # expose necessary functions
        self.env_dyn = self.env.unwrapped.dynamics
        self.env_noise = self.env.unwrapped.sigma
        self.env_cost = self.env.unwrapped.cost
        self.env_init = init_state

        self.ulim = self.env.action_space.high

        self.dm_state = self.env.observation_space.shape[0]
        self.dm_act = self.env.action_space.shape[0]
        self.dm_param = self.dm_state * (self.dm_act + self.dm_state + 1)

        self.nb_steps = nb_steps

        # use slew rate penalty or not
        self.env.unwrapped.slew_rate = slew_rate
        if action_penalty is not None:
            self.env.unwrapped.uw = action_penalty * np.ones((self.dm_act, ))

        self.policy_kl_stepwise = policy_kl_stepwise
        if self.policy_kl_stepwise:
            self.policy_kl_bound = policy_kl_bound * np.ones((self.nb_steps, ))
            self.alpha = 1e8 * np.ones((self.nb_steps, ))
        else:
            self.policy_kl_bound = policy_kl_bound * np.ones((1, ))
            self.alpha = 1e8 * np.ones((1, ))

        self.param_nominal_kl_bound = param_nominal_kl_bound * np.ones((1, ))
        self.beta = 1e16 * np.ones((1, ))

        self.param_regularizer_kl_bound = param_regularizer_kl_bound * np.ones(
            (1, ))
        self.eta = 1e16 * np.ones((1, ))

        # create state distribution and initialize first time step
        self.xdist = Gaussian(self.dm_state, self.nb_steps + 1)
        self.xdist.mu[..., 0], self.xdist.sigma[..., 0] = self.env_init

        self.udist = Gaussian(self.dm_act, self.nb_steps)
        self.xudist = Gaussian(self.dm_state + self.dm_act, self.nb_steps + 1)

        self.vfunc = QuadraticStateValue(self.dm_state, self.nb_steps + 1)
        self.qfunc = QuadraticStateActionValue(self.dm_state, self.dm_act,
                                               self.nb_steps)

        # We assume process noise over dynamics is known
        self.noise = np.zeros((self.dm_state, self.dm_state, self.nb_steps))
        for t in range(self.nb_steps):
            self.noise[..., t] = self.env_noise

        self.param = MatrixNormalParameters(self.dm_state, self.dm_act,
                                            self.nb_steps)
        self.nominal = MatrixNormalParameters(self.dm_state, self.dm_act,
                                              self.nb_steps)

        # LQG dynamics
        from autograd import jacobian

        inputs = (np.zeros((self.dm_state, )), np.zeros((self.dm_act, )))
        A = jacobian(self.env.unwrapped.dynamics, 0)(*inputs)
        B = jacobian(self.env.unwrapped.dynamics, 1)(*inputs)
        c = self.env.unwrapped.dynamics(*inputs)

        tmp = np.hstack((A, B, c[:, None]))
        for t in range(self.nb_steps):
            self.nominal.mu[..., t] = np.reshape(tmp, self.dm_param, order='F')
            self.nominal.sigma[..., t] = nominal_variance * np.eye(self.dm_param)

        self.ctl = LinearGaussianControl(self.dm_state, self.dm_act,
                                         self.nb_steps, init_action_sigma)

        # cost weighting activation: sigmoid-shaped ramp or geometric discount
        if activation is None:
            self.weighting = np.ones((self.nb_steps + 1, ))
        elif "mult" and "shift" in activation:
            t = np.linspace(0, self.nb_steps, self.nb_steps + 1)
            self.weighting = 1. / (1. + np.exp(-activation['mult'] *
                                               (t - activation['shift'])))
        elif "discount" in activation:
            self.weighting = np.ones((self.nb_steps + 1, ))
            gamma = activation["discount"] * np.ones((self.nb_steps, ))
            self.weighting[1:] = np.cumprod(gamma)
        else:
            raise NotImplementedError

        self.cost = AnalyticalQuadraticCost(self.env_cost, self.dm_state,
                                            self.dm_act, self.nb_steps + 1)

        self.data = {}
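
Example #5 obtains its nominal model by linearizing the environment dynamics at the origin with autograd's jacobian and stacking A, B, and the offset c into the matrix-normal mean column-wise (order='F'). The self-contained sketch below shows the same linearization pattern on a toy dynamics function; the dynamics itself is invented for illustration and is not the environment used above:

import autograd.numpy as np
from autograd import jacobian

def dynamics(x, u):
    # Toy discrete-time dynamics x' = A x + B u, purely illustrative.
    A = np.array([[1.0, 0.01], [0.0, 0.99]])
    B = np.array([[0.0], [0.01]])
    return np.dot(A, x) + np.dot(B, u)

x0, u0 = np.zeros((2, )), np.zeros((1, ))

A = jacobian(dynamics, 0)(x0, u0)   # df/dx evaluated at (x0, u0)
B = jacobian(dynamics, 1)(x0, u0)   # df/du evaluated at (x0, u0)
c = dynamics(x0, u0)                # constant offset

# Flatten [A | B | c] column-wise, matching the constructor's order='F' reshape.
params = np.reshape(np.hstack((A, B, c[:, None])), -1, order='F')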