# Example 1
    def parameter_dual(self, beta):
        """Evaluate the negated parameter dual and its gradient at `beta`.

        Runs an inner fixed-point iteration: repeated backward passes against
        an adversarial state distribution `q_xdist`, each followed by a
        cubature forward pass, until the forward-pass distribution matches
        `q_xdist` (per-step KL below 1e-3).

        Returns:
            (dual, grad): each a 1-element ndarray, both negated for use with
            a minimizer. Returns (nan, nan) if the backward pass diverges.
        """
        augmented_cost = self.parameter_augment_cost(beta)

        # Seed the adversarial state distribution with the policy's own xdist.
        q_xdist = deepcopy(self.xdist)

        # One sweep up front to get an initial convergence measure.
        param, xvalue, diverge = self.parameter_backward_pass(
            beta, augmented_cost, q_xdist)
        if diverge != -1:
            return np.nan, np.nan
        p_xdist, _, _ = self.cubature_forward_pass(self.ctl, param)

        kl_per_step = self.gaussians_kldiv(
            p_xdist.mu, p_xdist.sigma,
            q_xdist.mu, q_xdist.sigma,
            self.dm_state, self.nb_steps + 1)

        # Iterate until forward and adversarial distributions agree at every step.
        while np.any(kl_per_step > 1e-3):
            param, xvalue, diverge = self.parameter_backward_pass(
                beta, augmented_cost, q_xdist)
            if diverge != -1:
                return np.nan, np.nan
            p_xdist, _, _ = self.cubature_forward_pass(self.ctl, param)

            kl_per_step = self.gaussians_kldiv(
                p_xdist.mu, p_xdist.sigma,
                q_xdist.mu, q_xdist.sigma,
                self.dm_state, self.nb_steps + 1)

            # Nudge q a small step (1e-1) towards p between sweeps.
            q_xdist.mu, q_xdist.sigma = self.interp_gauss_kl(
                q_xdist.mu, q_xdist.sigma, p_xdist.mu, p_xdist.sigma, 1e-1)

        # Dual value: expected quadratic value at the initial state ...
        dual = quad_expectation(q_xdist.mu[..., 0], q_xdist.sigma[..., 0],
                                xvalue.V[..., 0], xvalue.v[..., 0],
                                xvalue.v0[..., 0])

        # ... plus beta times the KL-constraint violation; the same excess is
        # the dual's gradient w.r.t. beta.
        kl_excess = np.sum(self.parameter_kldiv(param)) - self.param_kl_bound
        dual += beta * kl_excess

        return -1. * np.array([dual]), -1. * np.array([kl_excess])
# Example 2
    def regularized_parameter_dual(self, eta, ctl, last):
        """Evaluate the negated regularized parameter dual and gradient at `eta`.

        Same inner fixed-point scheme as `parameter_dual`, but the backward
        pass is driven by the regularized augmented cost (KL against the
        `last` parameters, weighted by `eta`) under controller `ctl`.

        Returns:
            (dual, grad): dual is a 1-element ndarray, grad a scalar, both
            negated for a minimizer. Returns (nan, nan) on divergence.
        """
        augmented_cost = self.regularized_parameter_augment_cost(eta, last)

        # Seed the adversarial state distribution with the policy's own xdist.
        q_xdist = deepcopy(self.xdist)

        # One sweep up front to get an initial convergence measure.
        param, xvalue, diverge = self.parameter_backward_pass(
            0., augmented_cost, q_xdist, ctl, eta)
        if diverge != -1:
            return np.nan, np.nan
        p_xdist, _, _ = self.cubature_forward_pass(ctl, param)

        kl_per_step = self.gaussians_kldiv(
            p_xdist.mu, p_xdist.sigma,
            q_xdist.mu, q_xdist.sigma,
            self.dm_state, self.nb_steps + 1)

        # Iterate until forward and adversarial distributions agree at every step.
        while np.any(kl_per_step > 1e-3):
            param, xvalue, diverge = self.parameter_backward_pass(
                0., augmented_cost, q_xdist, ctl, eta)
            if diverge != -1:
                return np.nan, np.nan
            p_xdist, _, _ = self.cubature_forward_pass(ctl, param)

            kl_per_step = self.gaussians_kldiv(
                p_xdist.mu, p_xdist.sigma,
                q_xdist.mu, q_xdist.sigma,
                self.dm_state, self.nb_steps + 1)

            # Nudge q a small step (1e-1) towards p between sweeps.
            q_xdist.mu, q_xdist.sigma = self.interp_gauss_kl(
                q_xdist.mu, q_xdist.sigma, p_xdist.mu, p_xdist.sigma, 1e-1)

        # Dual value: expected quadratic value at the initial state.
        expectation = quad_expectation(q_xdist.mu[..., 0], q_xdist.sigma[..., 0],
                                       xvalue.V[..., 0], xvalue.v[..., 0],
                                       xvalue.v0[..., 0])

        # KL-regularizer excess doubles as the gradient w.r.t. eta.
        kl_excess = (np.sum(self.parameter_regularizer_kldiv(last, param))
                     - self.param_regularizer_kl_bound)
        dual = np.array([expectation]) + eta * kl_excess

        return -1. * dual, -1. * kl_excess
# Example 3
    def regularized_parameter_dual(self, beta, kappa):
        """Evaluate the negated proximal-regularized parameter dual at `beta`.

        The inner loop alternates a regularized backward pass (penalizing
        deviation from the previous state distribution with weight `kappa`)
        with a cubature forward pass, until successive state distributions
        agree (per-step KL below 1e-3).

        Returns:
            (dual, grad): each a 1-element ndarray, negated for a minimizer.
            Returns (nan, nan) if the backward pass diverges.
        """
        agcost = self.parameter_augment_cost(beta)

        q_xdist = deepcopy(self.xdist)

        # Wide initial parameter distribution for the very first forward pass.
        param = MatrixNormalParameters(self.dm_state, self.dm_act, self.nb_steps)
        p_xdist, _, _ = self.cubature_forward_pass(self.ctl, param)

        regcost = self.parameter_dual_regularization(p_xdist, q_xdist, kappa)
        param, xvalue, diverge = self.regularized_parameter_backward_pass(
            beta, agcost, p_xdist, regcost)
        if diverge != -1:
            return np.nan, np.nan

        # Convergence measure of the inner loop.
        # FIX: call gaussians_kldiv with (dm_state, nb_steps + 1) like the loop
        # body below — the original 4-arg call was inconsistent with every
        # other call site, and mixed a scalar pre-loop value with a per-step
        # array post-loop value.
        xdist_kl = self.gaussians_kldiv(p_xdist.mu, p_xdist.sigma,
                                        q_xdist.mu, q_xdist.sigma,
                                        self.dm_state, self.nb_steps + 1)
        while np.any(xdist_kl > 1e-3):
            regcost = self.parameter_dual_regularization(p_xdist, q_xdist, kappa)
            param, xvalue, diverge = self.regularized_parameter_backward_pass(
                beta, agcost, p_xdist, regcost)
            if diverge != -1:
                return np.nan, np.nan

            # Previous forward-pass distribution becomes the proximal anchor.
            q_xdist = deepcopy(p_xdist)
            p_xdist, _, _ = self.cubature_forward_pass(self.ctl, param)

            # Check convergence of loop.
            xdist_kl = self.gaussians_kldiv(p_xdist.mu, p_xdist.sigma,
                                            q_xdist.mu, q_xdist.sigma,
                                            self.dm_state, self.nb_steps + 1)

        # Dual value: expected quadratic value at the initial state.
        dual = quad_expectation(q_xdist.mu[..., 0], q_xdist.sigma[..., 0],
                                xvalue.V[..., 0], xvalue.v[..., 0],
                                xvalue.v0[..., 0])

        dual += beta * (np.sum(self.parameter_kldiv(param)) - self.param_kl_bound)
        # FIX: sum the per-step KLs so the proximal penalty is scalar and
        # `dual` stays scalar regardless of whether the loop ran.
        dual -= kappa * np.sum(xdist_kl)

        # Dual gradient w.r.t. beta is the parameter-KL constraint excess.
        grad = np.sum(self.parameter_kldiv(param)) - self.param_kl_bound

        return -1. * np.array([dual]), -1. * np.array([grad])
# Example 4
    def policy_dual(self, alpha):
        """Evaluate the negated policy dual and its gradient at `alpha`.

        Single backward/forward sweep: the backward pass under the
        alpha-augmented cost yields a controller and value function; the
        forward pass yields the induced state distribution for the KL terms.

        Returns:
            (dual, grad): negated dual value and KL-constraint excess —
            per-step arrays when `self.kl_stepwise`, 1-element array / scalar
            otherwise. Returns (nan, nan) if the backward pass diverges.
        """
        # augmented cost
        agcost = self.policy_augment_cost(alpha)

        # backward pass
        lgc, xvalue, xuvalue, diverge = self.policy_backward_pass(alpha, agcost)
        # FIX: the divergence flag was unpacked but never checked; bail out
        # like the sibling parameter duals do (-1 signals a clean pass).
        if diverge != -1:
            return np.nan, np.nan

        # forward pass
        xdist, udist, xudist = self.cubature_forward_pass(lgc, self.param)

        # dual expectation at the initial state
        dual = quad_expectation(xdist.mu[..., 0], xdist.sigma[..., 0],
                                xvalue.V[..., 0], xvalue.v[..., 0],
                                xvalue.v0[..., 0])

        if self.kl_stepwise:
            # Per-step multipliers: vector dual correction and gradient.
            dual = np.array([dual]) - np.sum(alpha * self.policy_kl_bound)
            grad = self.policy_kldiv(lgc, xdist) - self.policy_kl_bound
        else:
            # Single multiplier over the summed trajectory KL.
            dual = np.array([dual]) - alpha * self.policy_kl_bound
            grad = np.sum(self.policy_kldiv(lgc, xdist)) - self.policy_kl_bound

        return -1. * dual, -1. * grad