Beispiel #1
0
 def step(self, a):
     """Advance the environment by one step using action ``a``.

     Looks up the candidate outcomes stored in ``self.P`` for the current
     state and the given action, draws one of them according to the first
     element of each outcome tuple (its probability), and makes the drawn
     outcome's state the current state.

     Returns:
         A 4-tuple ``(state, reward, done, info)`` where ``state`` is the
         new state cast to ``int``, ``reward`` and ``done`` are taken
         verbatim from the drawn outcome, and ``info`` is
         ``{"prob": p}`` with ``p`` the drawn outcome's probability.
     """
     outcomes = self.P[self.s][a]
     weights = [outcome[0] for outcome in outcomes]
     chosen = categorical_sample(weights, self.np_random)
     prob, next_state, reward, done = outcomes[chosen]
     # Remember where we are and what was done last.
     self.s = next_state
     self.lastaction = a
     return (int(next_state), reward, done, {"prob": prob})
Beispiel #2
0
 def reset(
     self,
     *,
     seed: Optional[int] = None,
     return_info: bool = False,
     options: Optional[dict] = None,
 ):
     """Reset the environment to a freshly sampled initial state.

     Args:
         seed: Forwarded to the parent class's ``reset``
             (presumably reseeds ``self.np_random`` -- confirm against
             the base class).
         return_info: When true, also return an info dictionary
             alongside the observation. Previously this flag was
             accepted but silently ignored.
         options: Accepted for interface compatibility; not used here.

     Returns:
         ``int(self.s)`` when ``return_info`` is false, otherwise the
         tuple ``(int(self.s), {"prob": 1})``. The info payload follows
         the convention used by Gym's toy-text environments.
     """
     super().reset(seed=seed)
     # Draw the start state from the initial state distribution.
     self.s = categorical_sample(self.isd, self.np_random)
     self.lastaction = None
     observation = int(self.s)
     if return_info:
         return observation, {"prob": 1}
     return observation
Beispiel #3
0
 def reset(self,
           *,
           seed: Optional[int] = None,
           return_info: bool = False,
           options: Optional[dict] = None):
     """Reset the environment to a freshly sampled initial state.

     Args:
         seed: Forwarded to the parent class's ``reset``.
         return_info: When true, return an info dictionary alongside the
             observation.
         options: Accepted for interface compatibility; not used here.

     Returns:
         ``int(self.s)`` when ``return_info`` is false, otherwise the
         tuple ``(int(self.s), {"prob": 1})``.
     """
     super().reset(seed=seed)
     # Draw the start state from the initial state distribution.
     start = categorical_sample(self.initial_state_distrib, self.np_random)
     self.s = start
     self.lastaction = None
     observation = int(self.s)
     if return_info:
         return observation, {"prob": 1}
     return observation
Beispiel #4
0
    def __init__(self, n_states=7, p_stay=0.0, p_backward=0.5):
        """Build the transition table of a 1-D chain with two terminal ends.

        The chain has ``n_states + 2`` states: the requested states plus
        one terminal state at index 0 and one at the last index. The start
        state is the middle index. There are two actions; ``WEST`` moves
        toward index 0 and the other action moves toward the last index.

        Args:
            n_states: Number of states, not counting the two terminal
                states that are added.
            p_stay: Probability that an action leaves the state unchanged.
            p_backward: Probability that an action moves one step in the
                direction opposite to the one requested. The remaining
                mass ``1 - p_stay - p_backward`` moves in the requested
                direction.

        ``self.P[s][a]`` is a list of three ``(probability, next_state,
        reward, done)`` tuples in the order forward, stay, backward.
        A reward of 1.0 is given only for stepping from the second-to-last
        state into the last state. Terminal states transition to themselves.
        """
        # Pad the chain with one terminal state at each end.
        self.shape = (1, n_states + 2)
        self.start_state_index = self.shape[1] // 2

        self.nS = nS = np.prod(self.shape)
        self.nA = nA = 2

        last = nS - 1
        # Does not depend on state or action, so compute it once.
        p_forward = 1.0 - p_stay - p_backward

        def reward_for(src, dst):
            # Only the step from the second-to-last state into the last pays.
            return 1.0 if src == nS - 2 and dst == last else 0.0

        def done_for(src, dst):
            # Episode ends when a move lands on (or stays in) either end.
            return (src >= nS - 2 and dst == last) or (src <= 1 and dst == 0)

        def outcomes(s, a):
            at_end = s == last or s == 0
            if at_end:
                # Terminal states are absorbing.
                s_forward = s_backward = s
            else:
                delta = -1 if a == WEST else 1
                s_forward = np.clip(s + delta, 0, last)
                s_backward = np.clip(s - delta, 0, last)
            return [
                (p_forward, s_forward, reward_for(s, s_forward),
                 done_for(s, s_forward)),
                (p_stay, s, 0.0, at_end),
                (p_backward, s_backward, reward_for(s, s_backward),
                 done_for(s, s_backward)),
            ]

        self.P = {
            s: {a: outcomes(s, a) for a in range(nA)}
            for s in range(nS)
        }

        # Initial state distribution: all mass on the middle state.
        self.isd = np.zeros(nS)
        self.isd[self.start_state_index] = 1.0
        self.lastaction = None  # for rendering

        self.action_space = spaces.Discrete(self.nA)
        self.observation_space = spaces.Discrete(self.nS)

        self.s = categorical_sample(self.isd, self.np_random)