def forward(self, b, ox, a, box):
        I = torch.eye(5)

        # Q: process noise covariance
        Q = torch.zeros(5, 5)
        Q[-2:, -2:] = torch.diag(torch.exp(
            self.pro_noise_ln_vars))  # variance of vel, ang_vel

        # R: observation noise covariance
        R = torch.diag(torch.exp(self.obs_noise_ln_vars))

        # H: observation matrix (reads out vel, ang_vel scaled by obs_gains)
        H = torch.zeros(2, 5)
        H[:, -2:] = torch.diag(self.obs_gains)

        # Extended Kalman Filter
        pre_bx_, P = b
        bx_ = dynamics(pre_bx_, a.view(-1), self.dt, box, self.pro_gains,
                       self.pro_noise_ln_vars)
        bx_ = bx_.t()  # make a column vector
        A = self.A(bx_)  # after dynamics
        P_ = A.mm(P).mm(A.t()) + Q  # P_ = APA^T+Q
        if not is_pos_def(P_):
            print("P_:", P_)
            print("P:", P)
            print("A:", A)
            APA = A.mm(P).mm(A.t())
            print("APA:", APA)
            print("APA +:", is_pos_def(APA))
        error = ox - self.observations(bx_)  # innovation: observation minus predicted observation
        #error = ox - self.observations_mean(bx_)

        S = H.mm(P_).mm(H.t()) + R  # S = HPH^T+R
        K = P_.mm(H.t()).mm(torch.inverse(S))  # K = PHS^-1
        bx = bx_ + K.matmul(error)
        I_KH = I - K.mm(H)
        P = I_KH.mm(P_)

        if not is_pos_def(P):
            print("P is not positive definite after the update")
            print("P:", P)
            P = (P + P.t()) / 2 + 1e-6 * I  # re-symmetrize and add jitter to restore positive definiteness

        bx = bx.t()  # back to a row vector
        b = bx.view(-1), P  # belief

        # terminal check
        terminal = self._isTerminal(bx, a)  # check whether the monkey has stopped
        return b, {'stop': terminal}
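
For reference, the belief update above is a standard extended Kalman filter predict/update cycle (P_ = A P A^T + Q, S = H P_ H^T + R, K = P_ H^T S^-1). Below is a minimal, self-contained sketch of those two steps with plain tensors; the shapes and values are illustrative assumptions, and the mean is propagated with the linearized A here, whereas the module above uses its nonlinear dynamics() for that step.

import torch

def ekf_step(x, P, A, H, Q, R, z):
    # Predict: propagate mean and covariance through the (linearized) dynamics.
    x_pred = A @ x
    P_pred = A @ P @ A.t() + Q                 # P_ = A P A^T + Q
    # Update: innovation, Kalman gain, corrected mean and covariance.
    err = z - H @ x_pred                       # innovation
    S = H @ P_pred @ H.t() + R                 # S = H P_ H^T + R
    K = P_pred @ H.t() @ torch.inverse(S)      # K = P_ H^T S^-1
    x_new = x_pred + K @ err
    P_new = (torch.eye(P.shape[0]) - K @ H) @ P_pred
    return x_new, P_new

# Illustrative 5-d state / 2-d observation, mirroring forward() above.
x0 = torch.zeros(5, 1)
P0 = torch.eye(5) * 0.1
A = torch.eye(5)
H = torch.zeros(2, 5); H[:, -2:] = torch.eye(2)
Q = torch.eye(5) * 1e-3
R = torch.eye(2) * 1e-2
z = torch.zeros(2, 1)
x1, P1 = ekf_step(x0, P0, A, H, Q, R, z)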
Example #2
def rewardFunc(rew_std, x, P, scale):
    mu = x[:2]  # positional part of the belief mean
    R = torch.eye(2) * rew_std**2  # Gaussian reward zone with std rew_std
    P = P[:2, :2]  # positional block of the belief covariance
    S = R+P
    if not is_pos_def(S):
        print('R+P is not positive definite!')
    alpha = -0.5 * mu @ S.inverse() @ mu.t()
    #alpha = -0.5 * mu.matmul(torch.inverse(R+P)).matmul(mu.t())
    reward = torch.exp(alpha) / 2 / np.pi / torch.sqrt(S.det())

    # normalization: scale so that the maximum attainable reward is 1
    mu_zero = torch.zeros(1, 2)
    alpha_zero = -0.5 * mu_zero @ R.inverse() @ mu_zero.t()
    reward_zero = torch.exp(alpha_zero) / 2 / np.pi / torch.sqrt(R.det())
    reward = reward/reward_zero
    ####################

    reward = scale * reward  # scale to the per-timestep reward magnitude
    if reward > scale:
        print('reward exceeds the per-timestep maximum!', reward)
        print('mu', mu)
        print('P', P)
        print('R', R)
    return reward.view(-1)
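
A minimal usage sketch of rewardFunc; the state dimension, noise values, and scale below are illustrative assumptions, and is_pos_def from the surrounding module is assumed to be in scope.

import numpy as np
import torch

x = torch.zeros(5)        # belief mean: the agent believes it sits exactly on the target
P = torch.eye(5) * 0.01   # belief covariance; only the 2x2 positional block is used
r = rewardFunc(rew_std=0.4, x=x, P=P, scale=1.0)
print(r)                  # one-element tensor close to, but below, scale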