import numpy as np

from basis import Basis
from barrier import Barrier
from replay_buffer import ReplayBuffer  # module names follow the repo's flat layout


class RTErgodicControl(object):

    def __init__(self, model, target_dist,
                 weights=None, horizon=100, num_basis=5,
                 capacity=100000, batch_size=20):

        self.model = model
        self.target_dist = target_dist
        self.horizon = horizon
        self.replay_buffer = ReplayBuffer(capacity)
        self.batch_size = batch_size

        self.basis = Basis(self.model.explr_space, num_basis=num_basis)
        # weight low-frequency coefficients more heavily than high-frequency ones
        # (an alternative Sobolev-style weighting is
        #  lamk = 1.0 / (np.linalg.norm(self.basis.k, axis=1) + 1)**(3.0/2.0))
        self.lamk = np.exp(-0.8 * np.linalg.norm(self.basis.k, axis=1))
        self.barr = Barrier(self.model.explr_space)

        # initialize the control sequence
        self.u_seq = [0.0 * self.model.action_space.sample()
                      for _ in range(horizon)]

        if weights is None:
            weights = {'R': np.eye(self.model.action_space.shape[0])}
        self.Rinv = np.linalg.inv(weights['R'])

        self._phik = None
        self.ck = None

    def reset(self):
        self.u_seq = [0.0 * self.model.action_space.sample()
                      for _ in range(self.horizon)]
        self.replay_buffer.reset()

    @property
    def phik(self):
        return self._phik

    @phik.setter
    def phik(self, phik):
        assert len(phik) == self.basis.tot_num_basis, \
            'phik must have the same number of coefficients as ck'
        self._phik = phik

    def __call__(self, state, ck_list=None, agent_num=None,
                 seq=False, turtle_mode=True):
        assert self.phik is not None, \
            'phik is not set; use the phik setter before calling the controller'

        # shift the control sequence forward by one time step
        self.u_seq[:-1] = self.u_seq[1:]
        self.u_seq[-1] = np.zeros(self.model.action_space.shape)

        x = self.model.reset(state)

        pred_traj = []
        dfk = []
        fdx = []
        fdu = []
        dbar = []
        for t in range(self.horizon):
            # collect all the information that is needed
            pred_traj.append(x[self.model.explr_idx])
            dfk.append(self.basis.dfk(x[self.model.explr_idx]))
            fdx.append(self.model.fdx(x, self.u_seq[t]))
            fdu.append(self.model.fdu(x))
            dbar.append(self.barr.dx(x[self.model.explr_idx]))
            # step the model forward; as written, the prediction is
            # rolled out under zero control (hence the * 0.)
            x = self.model.step(self.u_seq[t] * 0.)
        # append past experiences so the time-averaged trajectory
        # statistics also cover states the agent has already visited
        # (drawing a random batch with self.replay_buffer.sample(...)
        #  is an alternative; DErgControl below does exactly that)
        if len(self.replay_buffer) > self.batch_size:
            past_states = self.replay_buffer.buffer[-self.batch_size:]
        else:
            past_states = self.replay_buffer.buffer[0:]
        pred_traj = pred_traj + past_states

        # calculate the cks for the trajectory
        # *** this is also in the utils file
        N = len(pred_traj)
        ck = np.sum([self.basis.fk(xt) for xt in pred_traj], axis=0) / N
        self.ck = ck.copy()

        # multi-agent case: average the coefficients across agents
        if ck_list is not None:
            ck_list[agent_num] = ck
            ck = np.mean(ck_list, axis=0)

        fourier_diff = self.lamk * (ck - self.phik)
        fourier_diff = fourier_diff.reshape(-1, 1)

        # backwards pass: integrate the adjoint rho and descend on the controls
        rho = np.zeros(self.model.observation_space.shape)
        for t in reversed(range(self.horizon)):
            edx = np.zeros(self.model.observation_space.shape)
            edx[self.model.explr_idx] = np.sum(dfk[t] * fourier_diff, 0)

            bdx = np.zeros(self.model.observation_space.shape)
            bdx[self.model.explr_idx] = dbar[t]

            rho = rho - self.model.dt * (-edx - bdx - np.dot(fdx[t].T, rho))

            self.u_seq[t] = -np.dot(np.dot(self.Rinv, fdu[t].T), rho)
            # normalize any control that saturates
            if (np.abs(self.u_seq[t]) > 1.0).any():
                self.u_seq[t] /= np.linalg.norm(self.u_seq[t])
            if turtle_mode:
                # clip to the differential-drive robot's linear/angular velocity limits
                self.u_seq[t][0] = np.clip(self.u_seq[t][0], -0.2, 0.2)
                self.u_seq[t][1] = np.clip(self.u_seq[t][1], -2.8, 2.8)

        self.replay_buffer.push(state[self.model.explr_idx])
        if seq is False:
            return self.u_seq[0].copy()
        else:
            return self.u_seq[0].copy(), self.u_seq.copy()
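# ---------------------------------------------------------------
# A minimal sketch of how the phik coefficients expected by the
# phik setter above might be constructed. Since ck is the average
# of basis.fk over a trajectory, phik for a target density phi can
# be estimated the same way by averaging fk over samples drawn
# from phi. The uniform target and the sample count here are
# illustrative assumptions, not part of the controller.
import numpy as np
from gym.spaces import Box
from basis import Basis

explr_space = Box(np.array([0.0, 0.0]), np.array([1.0, 1.0]), dtype=np.float32)
basis = Basis(explr_space, num_basis=5)

# Monte-Carlo estimate of phik = E_{x ~ phi}[fk(x)] for a uniform phi
samples = [explr_space.sample() for _ in range(1000)]
phik = np.mean([basis.fk(x) for x in samples], axis=0)
# ctrl.phik = phik   # hand the coefficients to an RTErgodicControl instance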
import numpy as np

from gym.spaces import Box

from basis import Basis

# define the exploration space as a gym Box
explr_space = Box(np.array([0.0, 0.0]), np.array([1.0, 1.0]), dtype=np.float32)

# define the basis object
basis = Basis(explr_space=explr_space, num_basis=5)

# simulate/randomize a trajectory
xt = [explr_space.sample() for _ in range(10)]

# print the indices of all basis functions
# (there are num_basis**2 of them)
print('indices for all basis functions: ')
print(basis.k)

# evaluate the basis functions at a pose
print(basis.fk(xt[0]))

# evaluate the derivative of the basis functions w.r.t. a pose
print(basis.dfk(xt[0]))

# hk is computed in the source code but never used
# in the end, so we ignore it here

###########################################
# convert a trajectory to its ck coefficients
from utils import convert_traj2ck

ck = convert_traj2ck(basis, xt)
print('ck: ')
print(ck)

###########################################
# barrier function test
from barrier import Barrier
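# the script above stops at the Barrier import; the following is a
# minimal continuation, assuming only the Barrier(explr_space)
# constructor and the dx method that the controllers in this repo use
barr = Barrier(explr_space)

# gradient of the barrier cost at a pose inside the domain
# (the barrier should be inactive away from the boundary)
print(barr.dx(xt[0]))

# gradient at a pose outside the unit box, where the barrier
# activates to steer the state back into the exploration space
print(barr.dx(np.array([1.2, -0.1])))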
from copy import deepcopy

import numpy as np
import rospy

from basis import Basis
from barrier import Barrier
from replay_buffer import ReplayBuffer  # module names assumed from the repo's flat layout
from target_dist import TargetDist
from ergodic_control.msg import Ck      # assumed package name for the custom Ck message


class DErgControl(object):

    def __init__(self, agent_name, model,
                 weights=None, t_horizon=10, num_basis=5,
                 capacity=100000, batch_size=20):

        self._agent_name = agent_name
        self._model = model
        self._t_horizon = t_horizon
        self._replay_buffer = ReplayBuffer(capacity)
        self._batch_size = batch_size

        self._basis = Basis(self._model.explr_space, num_basis=num_basis)
        # weight low-frequency coefficients more heavily than high-frequency ones
        self._lamk = np.exp(-0.8 * np.linalg.norm(self._basis.k, axis=1))
        self._barr = Barrier(self._model.explr_space)
        self._targ_dist = TargetDist(basis=self._basis)

        self._u = [np.zeros(self._model.action_space.shape[0])
                   for _ in range(t_horizon)]
        # TODO: implement more weights
        if weights is None:
            weights = {'R': np.eye(self._model.action_space.shape[0])}
        self._Rinv = np.linalg.inv(weights['R'])

        self._phik = None
        self._ck_mean = None
        self._ck_msg = Ck()
        self._ck_msg.name = self._agent_name
        self._ck_dict = {}

        self.pred_path = []

        # set up the ROS communication last
        rospy.Subscriber('ck_link', Ck, self._ck_link_callback)
        self._ck_pub = rospy.Publisher('ck_link', Ck, queue_size=1)

    def _ck_link_callback(self, msg):
        # store the latest ck coefficients broadcast by the other agents
        if msg.name != self._agent_name:
            self._ck_dict[msg.name] = np.array(msg.ck)

    def reset(self):
        self._u = [np.zeros(self._model.action_space.shape[0])
                   for _ in range(self._t_horizon)]
        self._replay_buffer.reset()

    def __call__(self, state):
        assert self._targ_dist.phik is not None, 'Forgot to set phik'

        # if the target distribution was updated, the stored trajectory
        # statistics are stale, so clear the replay buffer
        if self._targ_dist.has_update:
            self._replay_buffer.reset()
            self._targ_dist.has_update = False

        # shift the control sequence forward by one time step
        self._u[:-1] = self._u[1:]
        self._u[-1] = np.zeros(self._model.action_space.shape[0])

        x = self._model.reset(state.copy())

        pred_traj = []
        dfk = []
        fdx = []
        fdu = []
        dbar = []
        for t in range(self._t_horizon):
            # collect all the information that is needed
            pred_traj.append(x[self._model.explr_idx])
            dfk.append(self._basis.dfk(x[self._model.explr_idx]))
            dbar.append(self._barr.dx(x[self._model.explr_idx]))
            # step the model forward and record the local linearization
            x = self._model.step(self._u[t])
            fdx.append(self._model.A)
            fdu.append(self._model.B)

        self.pred_path = deepcopy(pred_traj)

        # sample any past experiences
        if len(self._replay_buffer) > self._batch_size:
            past_states = self._replay_buffer.sample(self._batch_size)
        else:
            past_states = self._replay_buffer.sample(len(self._replay_buffer))
        pred_traj = pred_traj + past_states

        # calculate the cks for the trajectory
        # *** this is also in the utils file
        N = len(pred_traj)
        ck = np.sum([self._basis.fk(xt) for xt in pred_traj], axis=0) / N

        # broadcast this agent's coefficients and average them with the
        # coefficients received from the other agents
        self._ck_msg.ck = ck.copy()
        self._ck_pub.publish(self._ck_msg)
        if len(self._ck_dict.keys()) > 1:
            self._ck_dict[self._agent_name] = ck
            cks = [self._ck_dict[key] for key in self._ck_dict.keys()]
            ck = np.mean(cks, axis=0)
        self._ck_mean = ck

        fourier_diff = self._lamk * (ck - self._targ_dist.phik)
        fourier_diff = fourier_diff.reshape(-1, 1)

        # backwards pass: integrate the adjoint rho and descend on the controls
        rho = np.zeros(self._model.observation_space.shape[0])
        for t in reversed(range(self._t_horizon)):
            edx = np.zeros(self._model.observation_space.shape[0])
            edx[self._model.explr_idx] = np.sum(dfk[t] * fourier_diff, 0)

            bdx = np.zeros(self._model.observation_space.shape[0])
            bdx[self._model.explr_idx] = dbar[t]

            rho = rho - self._model.dt * (-edx - bdx - np.dot(fdx[t].T, rho))

            self._u[t] = -np.dot(np.dot(self._Rinv, fdu[t].T), rho)
            # normalize any control that saturates
            if (np.abs(self._u[t]) > 1.0).any():
                self._u[t] /= np.linalg.norm(self._u[t])

        self._replay_buffer.push(state[self._model.explr_idx].copy())
        return self._u[0].copy()
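# ---------------------------------------------------------------
# A minimal, hypothetical usage sketch for DErgControl in a ROS
# node. 'PointModel' and its module are stand-ins for any model
# exposing the interface used above (explr_space, explr_idx,
# action_space, observation_space, reset, step, A, B, dt); the
# node name and 10 Hz rate are likewise assumptions.
import rospy
from point_model import PointModel  # hypothetical model module

rospy.init_node('agent0_erg_ctrl')

model = PointModel()
ctrl = DErgControl('agent0', model, t_horizon=10, num_basis=5)
# ctrl._targ_dist is assumed to provide phik before the first call
# (see the assert at the top of __call__)

state = model.reset()
rate = rospy.Rate(10)
while not rospy.is_shutdown():
    u = ctrl(state.copy())   # one receding-horizon control step
    state = model.step(u)    # apply it to the simulated agent
    rate.sleep()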