def forward(self, b, ox, a, box):
    """One Extended-Kalman-Filter belief-update step.

    Args:
        b: current belief as a tuple ``(bx, P)`` — state mean (row vector,
           5 elements: presumably pos x/y, heading, vel, ang_vel — TODO confirm)
           and 5x5 covariance.
        ox: observation (2 elements; velocity-channel readout via H).
        a: action tensor (flattened before being passed to ``dynamics``).
        box: extra argument forwarded to ``dynamics`` (semantics defined
             elsewhere — NOTE(review): confirm against ``dynamics``).

    Returns:
        ``(b, info)`` where ``b = (bx.view(-1), P)`` is the updated belief and
        ``info = {'stop': terminal}`` flags whether ``self._isTerminal`` fired.
    """
    I = torch.eye(5)

    # Q matrix: process noise — only the last two state dims (vel, ang_vel)
    # carry process noise, with variances exp(pro_noise_ln_vars).
    Q = torch.zeros(5, 5)
    Q[-2:, -2:] = torch.diag(torch.exp(self.pro_noise_ln_vars))

    # R matrix: observation noise covariance.
    R = torch.diag(torch.exp(self.obs_noise_ln_vars))

    # H matrix: observation model — gains on the last two state dims.
    H = torch.zeros(2, 5)
    H[:, -2:] = torch.diag(self.obs_gains)

    # --- Extended Kalman Filter ---
    pre_bx_, P = b

    # Predict: propagate the mean through the (nonlinear) dynamics.
    bx_ = dynamics(pre_bx_, a.view(-1), self.dt, box,
                   self.pro_gains, self.pro_noise_ln_vars)
    bx_ = bx_.t()  # make a column vector

    # Predict: propagate the covariance with the Jacobian A evaluated
    # after the dynamics step. P_ = A P A^T + Q
    A = self.A(bx_)
    P_ = A.mm(P).mm(A.t()) + Q
    if not is_pos_def(P_):
        # Diagnostic dump: predicted covariance lost positive definiteness.
        print("P_:", P_)
        print("P:", P)
        print("A:", A)
        APA = A.mm(P).mm(A.t())
        print("APA:", APA)
        print("APA +:", is_pos_def(APA))

    # Update: innovation, innovation covariance, Kalman gain.
    error = ox - self.observations(bx_)
    S = H.mm(P_).mm(H.t()) + R           # S = H P_ H^T + R
    K = P_.mm(H.t()).mm(torch.inverse(S))  # K = P_ H^T S^-1
    bx = bx_ + K.matmul(error)
    I_KH = I - K.mm(H)
    P = I_KH.mm(P_)
    if not is_pos_def(P):
        print("here")
        print("P:", P)
        # Make symmetric and nudge the diagonal to avoid numerical blow-ups.
        P = (P + P.t()) / 2 + 1e-6 * I

    bx = bx.t()  # return to a row vector
    b = bx.view(-1), P  # belief

    # Terminal check: has the agent (monkey) stopped?
    terminal = self._isTerminal(bx, a)
    return b, {'stop': terminal}
def rewardFunc(rew_std, x, P, scale):
    """Gaussian reward on the belief over position.

    Evaluates an (unnormalized-then-renormalized) Gaussian of the belief
    position mean under covariance ``S = R + P``, scaled so that the maximum
    attainable reward (mean exactly at the target) equals ``scale``.

    Args:
        rew_std: std of the reward Gaussian (scalar); R = I * rew_std**2.
        x: belief state; the first two entries are the position mean
           (assumed a 1x2 row tensor so ``mu.t()`` is valid — TODO confirm).
        P: belief covariance; only the top-left 2x2 (position) block is used.
        scale: per-timestep reward scaling; also the theoretical maximum.

    Returns:
        1-element tensor with the reward.
    """
    mu = x[:2]  # position mean
    R = torch.eye(2) * rew_std**2  # reward function is gaussian
    P = P[:2, :2]  # position covariance block
    S = R + P
    if not is_pos_def(S):
        print('R+P is not positive definite!')

    # Gaussian density of mu under N(0, S).
    alpha = -0.5 * mu @ S.inverse() @ mu.t()
    reward = torch.exp(alpha) / 2 / np.pi / torch.sqrt(S.det())

    # Normalization: divide by the density's value at mu = 0 with zero
    # state uncertainty (S = R), so a perfect, certain hit yields 1.
    mu_zero = torch.zeros(1, 2)
    alpha_zero = -0.5 * mu_zero @ R.inverse() @ mu_zero.t()  # always 0
    reward_zero = torch.exp(alpha_zero) / 2 / np.pi / torch.sqrt(R.det())
    reward = reward / reward_zero

    reward = scale * reward  # adjustment for reward per timestep
    if reward > scale:
        # Should be impossible after normalization; dump diagnostics.
        print('reward is wrong!', reward)
        print('mu', mu)
        print('P', P)
        print('R', R)
    return reward.view(-1)