Example #1
import numpy as np

from basis import Basis
from barrier import Barrier
from replay_buffer import ReplayBuffer  # module name assumed, mirroring basis/barrier


class RTErgodicControl(object):
    def __init__(self,
                 model,
                 target_dist,
                 weights=None,
                 horizon=100,
                 num_basis=5,
                 capacity=100000,
                 batch_size=20):

        self.model = model
        self.target_dist = target_dist
        self.horizon = horizon
        self.replay_buffer = ReplayBuffer(capacity)
        self.batch_size = batch_size

        self.basis = Basis(self.model.explr_space, num_basis=num_basis)
        # weight each Fourier mode so low-frequency modes dominate the metric;
        # a Sobolev-style alternative: 1.0/(np.linalg.norm(self.basis.k, axis=1) + 1)**(3.0/2.0)
        self.lamk = np.exp(-0.8 * np.linalg.norm(self.basis.k, axis=1))
        self.barr = Barrier(self.model.explr_space)
        # initialize the control sequence to zeros
        self.u_seq = [np.zeros(self.model.action_space.shape)
                      for _ in range(horizon)]
        if weights is None:
            weights = {'R': np.eye(self.model.action_space.shape[0])}

        self.Rinv = np.linalg.inv(weights['R'])

        self._phik = None
        self.ck = None

    def reset(self):
        self.u_seq = [np.zeros(self.model.action_space.shape)
                      for _ in range(self.horizon)]
        self.replay_buffer.reset()

    @property
    def phik(self):
        return self._phik

    @phik.setter
    def phik(self, phik):
        assert len(phik) == self.basis.tot_num_basis, \
            'phik must have the same number of coefficients as ck'
        self._phik = phik

    def __call__(self,
                 state,
                 ck_list=None,
                 agent_num=None,
                 seq=False,
                 turtle_mode=True):
        assert self.phik is not None, 'Forgot to set phik; use the set_target_phik method'

        # shift the control sequence forward one step and pad with zeros
        self.u_seq[:-1] = self.u_seq[1:]
        self.u_seq[-1] = np.zeros(self.model.action_space.shape)

        x = self.model.reset(state)

        pred_traj = []
        dfk = []
        fdx = []
        fdu = []
        dbar = []
        for t in range(self.horizon):
            # collect all the information that is needed
            pred_traj.append(x[self.model.explr_idx])
            dfk.append(self.basis.dfk(x[self.model.explr_idx]))
            fdx.append(self.model.fdx(x, self.u_seq[t]))
            fdu.append(self.model.fdu(x))
            dbar.append(self.barr.dx(x[self.model.explr_idx]))
            # step the model forwards
            x = self.model.step(self.u_seq[t])

        # append recently visited states so the ergodic metric accounts for
        # where the agent has already been
        if len(self.replay_buffer) > self.batch_size:
            past_states = self.replay_buffer.buffer[-self.batch_size:]
        else:
            past_states = self.replay_buffer.buffer[:]
        pred_traj = pred_traj + past_states

        # compute the trajectory's Fourier coefficients ck
        # (this computation also lives in the utils file)
        N = len(pred_traj)
        ck = np.sum([self.basis.fk(xt) for xt in pred_traj], axis=0) / N

        self.ck = ck.copy()
        # average with other agents' coefficients for decentralized coverage
        if ck_list is not None:
            ck_list[agent_num] = ck
            ck = np.mean(ck_list, axis=0)

        # weighted difference between trajectory and target coefficients
        fourier_diff = self.lamk * (ck - self.phik)
        fourier_diff = fourier_diff.reshape(-1, 1)

        # backward pass: integrate the adjoint (costate) rho through time
        rho = np.zeros(self.model.observation_space.shape)
        for t in reversed(range(self.horizon)):
            edx = np.zeros(self.model.observation_space.shape)
            edx[self.model.explr_idx] = np.sum(dfk[t] * fourier_diff, 0)

            bdx = np.zeros(self.model.observation_space.shape)
            bdx[self.model.explr_idx] = dbar[t]
            rho = rho - self.model.dt * (-edx - bdx - np.dot(fdx[t].T, rho))

            self.u_seq[t] = -np.dot(np.dot(self.Rinv, fdu[t].T), rho)

            # normalize any control that exceeds the unit bound
            if (np.abs(self.u_seq[t]) > 1.0).any():
                self.u_seq[t] /= np.linalg.norm(self.u_seq[t])

            if turtle_mode:
                # clip to TurtleBot-style velocity limits:
                # linear +/-0.2, angular +/-2.8
                self.u_seq[t][0] = np.clip(self.u_seq[t][0], -0.2, 0.2)
                self.u_seq[t][1] = np.clip(self.u_seq[t][1], -2.8, 2.8)

        self.replay_buffer.push(state[self.model.explr_idx])

        if seq:
            # also return the full planned control sequence
            return self.u_seq[0].copy(), self.u_seq.copy()
        return self.u_seq[0].copy()
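
A minimal closed-loop sketch of how this controller is typically driven follows. The DoubleIntegrator model, TargetDist, and convert_phi2phik helper are assumptions mirroring the utils import in Example #2; any model exposing explr_space, explr_idx, action_space, observation_space, dt, reset, step, fdx, and fdu should fit.

import numpy as np
from target_dist import TargetDist              # assumed module layout
from double_integrator import DoubleIntegrator  # hypothetical model class
from utils import convert_phi2phik              # assumed helper alongside convert_traj2ck

env = DoubleIntegrator()    # the "real" system being controlled
model = DoubleIntegrator()  # separate copy used internally for predictions
t_dist = TargetDist()
ctrl = RTErgodicControl(model, t_dist, horizon=15, num_basis=5)
# project the target distribution onto the basis to obtain phik
# (grid_vals/grid attribute names are assumptions)
ctrl.phik = convert_phi2phik(ctrl.basis, t_dist.grid_vals, t_dist.grid)

state = env.reset()
for _ in range(500):
    u = ctrl(state, turtle_mode=False)  # one receding-horizon control step
    state = env.step(u)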
Example #2
import numpy as np
import numpy.random as npr
from basis import Basis
from gym.spaces import Box

# define the exploration space as a gym.spaces.Box
explr_space = Box(np.array([0.0, 0.0]), np.array([1.0, 1.0]), dtype=np.float32)
# define the basis object
basis = Basis(explr_space=explr_space, num_basis=5)
# sample a random trajectory of 10 poses in the exploration space
xt = [explr_space.sample() for _ in range(10)]
# print the multi-indices for all basis functions;
# for a 2D space there are num_basis**2 of them
print('indices for all basis functions: ')
print(basis.k)
# evaluate all basis functions at a single pose
print(basis.fk(xt[0]))
# evaluate the derivative of the basis functions w.r.t. the pose
print(basis.dfk(xt[0]))
# hk (the normalization term) is computed in the source code
# but never used downstream, so we ignore it here

###########################################
# convert the trajectory to its Fourier coefficients ck
from utils import convert_traj2ck
ck = convert_traj2ck(basis, xt)
print('ck: ')
print(ck)

###########################################
# barrier function test
from barrier import Barrier
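
The snippet ends at the Barrier import. A plausible continuation of the test, reusing explr_space and xt from above, might look like this; the cost method name is an assumption, while dx is the derivative used by the controllers in Examples #1 and #3.

barr = Barrier(explr_space)
# penalty for a pose near/outside the boundary (method name `cost` assumed)
print(barr.cost(xt[0]))
# derivative of the barrier w.r.t. the pose, as consumed by the controllers
print(barr.dx(xt[0]))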
Example #3
from copy import deepcopy

import numpy as np
import rospy

from basis import Basis
from barrier import Barrier
from replay_buffer import ReplayBuffer   # module names assumed, as in Example #1
from target_dist import TargetDist       # assumed module layout
from dist_ergodic.msg import Ck          # custom ROS Ck message; package name assumed


class DErgControl(object):

    def __init__(self, agent_name, model,
                 weights=None, t_horizon=10, num_basis=5,
                 capacity=100000, batch_size=20):

        self._agent_name    = agent_name
        self._model         = model
        self._t_horizon     = t_horizon
        self._replay_buffer = ReplayBuffer(capacity)
        self._batch_size    = batch_size

        self._basis = Basis(self._model.explr_space, num_basis=num_basis)
        self._lamk  = np.exp(-0.8 * np.linalg.norm(self._basis.k, axis=1))
        self._barr  = Barrier(self._model.explr_space)

        self._targ_dist = TargetDist(basis=self._basis)

        self._u = [np.zeros(self._model.action_space.shape[0])
                   for _ in range(t_horizon)]
        # TODO: implement more weights
        if weights is None:
            weights = {'R' : np.eye(self._model.action_space.shape[0])}
        self._Rinv = np.linalg.inv(weights['R'])

        self._phik    = None
        self._ck_mean = None
        self._ck_msg  = Ck()
        self._ck_msg.name = self._agent_name
        self._ck_dict = {}

        self.pred_path = []

        # set up the ROS pub/sub last
        rospy.Subscriber('ck_link', Ck, self._ck_link_callback)
        self._ck_pub = rospy.Publisher('ck_link', Ck, queue_size=1)


    def _ck_link_callback(self, msg):
        # store the latest ck broadcast by every other agent
        if msg.name != self._agent_name:
            self._ck_dict[msg.name] = np.array(msg.ck)

    def reset(self):
        self._u = [np.zeros(self._model.action_space.shape[0])
                   for _ in range(self._t_horizon)]
        self._replay_buffer.reset()


    def __call__(self, state):
        assert self._targ_dist.phik is not None, 'Forgot to set phik'

        # a new target distribution invalidates the stored trajectory history
        if self._targ_dist.has_update:
            self._replay_buffer.reset()
            self._targ_dist.has_update = False

        # shift the control sequence forward one step and pad with zeros
        self._u[:-1] = self._u[1:]
        self._u[-1]  = np.zeros(self._model.action_space.shape[0])

        x = self._model.reset(state.copy())

        pred_traj = []
        dfk       = []
        fdx       = []
        fdu       = []
        dbar      = []
        for t in range(self._t_horizon):
            # collect all the information that is needed
            pred_traj.append(x[self._model.explr_idx])
            dfk.append(self._basis.dfk(x[self._model.explr_idx]))
            dbar.append(self._barr.dx(x[self._model.explr_idx]))
            # step the model forward and record its linearization
            x = self._model.step(self._u[t])
            fdx.append(self._model.A)
            fdu.append(self._model.B)

        self.pred_path = deepcopy(pred_traj)
        # sample any past experiences
        if len(self._replay_buffer) > self._batch_size:
            past_states     = self._replay_buffer.sample(self._batch_size)
            pred_traj       = pred_traj + past_states
        else:
            past_states = self._replay_buffer.sample(len(self._replay_buffer))
            pred_traj   = pred_traj + past_states

        # compute the trajectory's Fourier coefficients ck
        # (this computation also lives in the utils file)
        N = len(pred_traj)
        ck = np.sum([self._basis.fk(xt) for xt in pred_traj], axis=0) / N
        # broadcast this agent's ck to the other agents
        self._ck_msg.ck = ck.copy()
        self._ck_pub.publish(self._ck_msg)

        # average the coefficients over every agent we have heard from
        if len(self._ck_dict) > 1:
            self._ck_dict[self._agent_name] = ck
            ck = np.mean(list(self._ck_dict.values()), axis=0)
        self._ck_mean = ck

        fourier_diff = self._lamk * (ck - self._targ_dist.phik)
        fourier_diff = fourier_diff.reshape(-1, 1)


        # backward pass: integrate the adjoint (costate) rho through time
        rho = np.zeros(self._model.observation_space.shape[0])
        for t in reversed(range(self._t_horizon)):
            edx = np.zeros(self._model.observation_space.shape[0])
            edx[self._model.explr_idx] = np.sum(dfk[t] * fourier_diff, 0)

            bdx = np.zeros(self._model.observation_space.shape[0])
            bdx[self._model.explr_idx] = dbar[t]
            rho = rho - self._model.dt * (-edx - bdx - np.dot(fdx[t].T, rho))

            self._u[t] = -np.dot(np.dot(self._Rinv, fdu[t].T), rho)
            # normalize any control that exceeds the unit bound
            if (np.abs(self._u[t]) > 1.0).any():
                self._u[t] /= np.linalg.norm(self._u[t])

        self._replay_buffer.push(state[self._model.explr_idx].copy())
        return self._u[0].copy()
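
A minimal sketch of how this controller might be driven inside a ROS node. The node name, loop rate, and model class are assumptions, and the internal TargetDist must have phik available before the first call:

import rospy
from double_integrator import DoubleIntegrator  # hypothetical model class

rospy.init_node('ergodic_agent')                # node name assumed
env = DoubleIntegrator()    # the "real" system
model = DoubleIntegrator()  # internal prediction model
ctrl = DErgControl('agent0', model, t_horizon=10, num_basis=5)

state = env.reset()
rate = rospy.Rate(10)       # 10 Hz loop rate assumed
while not rospy.is_shutdown():
    u = ctrl(state)         # publishes this agent's ck and returns a control
    state = env.step(u)
    rate.sleep()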