Code example #1
    def _update_Q(self, experience_tuple, space):
        """
        Update the Q-table from a single experience tuple.
        :param experience_tuple: (s, a, s', r) transition
        :param space: observation space used to discretize states
        :return: absolute change in the updated Q-value
        """
        if not self.training_mode:
            return 0
        s, a, sp, r = experience_tuple
        s_d = discretize_state(s, space)
        sp_d = discretize_state(sp, space)
        prev_Q = self.Q[s_d, a]
        # Q-learning update: bootstrap from the best action in s'
        max_next_Q = max(self.Q[sp_d, i] for i in range(self.na))
        updated_Q = prev_Q + self.alpha * (r + self.gamma * max_next_Q - prev_Q)
        self.Q[s_d, a] = updated_Q
        self.alpha *= self.alpha_decay_rate
        return abs(updated_Q - prev_Q)
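
The update above relies on a discretize_state helper that maps a continuous observation to a Q-table row index but is not shown here. A minimal sketch of what such a helper might look like, assuming a gym-style Box space with finite low/high bounds and a fixed number of bins per dimension (the bin count and the helper body are assumptions, not part of the original code):

import numpy as np

def discretize_state(s, space, n_bins=10):
    # Assumed implementation: clip the observation into the space bounds,
    # bucket each dimension into one of n_bins bins, and fold the
    # per-dimension bins into a single integer index.
    low, high = space.low, space.high
    ratios = (np.clip(s, low, high) - low) / (high - low)
    bins = np.minimum((ratios * n_bins).astype(int), n_bins - 1)
    # Treat the per-dimension bins as digits of a base-n_bins number
    return int(sum(b * n_bins ** i for i, b in enumerate(bins)))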
Code example #2
    def _query(self, s, a, sp, r, space):
        """
        Update the Q-table with the latest transition, then select the
        next action (epsilon-greedy).
        :param s: previous state
        :param a: selected action
        :param sp: new state
        :param r: immediate reward
        :param space: observation space used to discretize states
        :return: (next action, absolute change in the updated Q-value)
        """
        delta_Q = self._update_Q((s, a, sp, r), space)

        self.total_actions += 1

        # Dyna-Q
        if self.dyna > 0:

            # Instead of learning T and R models, keep the raw experience
            # in memory for replay
            self.memory.append((s, a, sp, r))

            # Hallucinate: replay `dyna` randomly chosen past experiences
            for _ in range(self.dyna):
                self._update_Q(self.memory[np.random.choice(len(self.memory))],
                               space)

        # Epsilon-greedy action selection; decay epsilon only when exploring
        if np.random.random() < self.epsilon:
            self.random_actions += 1
            action = np.random.choice(self.na)
            self.epsilon *= self.epsilon_decay_rate
        else:
            action = np.argmax([
                self.Q[discretize_state(sp, space), i] for i in range(self.na)
            ])

        # Remember current state and action for the next step
        self.s = sp
        self.a = action

        return action, delta_Q
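
These methods assume a set of attributes initialized in the agent's constructor. A constructor sketch consistent with the attribute names used in the examples; the class name and the default hyperparameter values are illustrative assumptions, not from the original:

import numpy as np

class QLearningAgent:  # hypothetical class name
    def __init__(self, n_states, n_actions, alpha=0.2, gamma=0.9,
                 epsilon=1.0, alpha_decay_rate=0.999,
                 epsilon_decay_rate=0.999, dyna=0, training_mode=True):
        # Default values above are illustrative assumptions
        self.na = n_actions                        # number of discrete actions
        self.Q = np.zeros((n_states, n_actions))   # Q-table indexed [state, action]
        self.alpha = alpha                         # learning rate
        self.alpha_decay_rate = alpha_decay_rate
        self.gamma = gamma                         # discount factor
        self.epsilon = epsilon                     # exploration rate
        self.epsilon_decay_rate = epsilon_decay_rate
        self.dyna = dyna                           # Dyna-Q replay updates per step
        self.memory = []                           # experience tuples for replay
        self.training_mode = training_mode
        self.total_actions = 0
        self.random_actions = 0
        self.s = None                              # last observed state
        self.a = None                              # last selected action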
Code example #3
    def _query_initial(self, s, space):
        """
        Select the first action of an episode without updating the Q-table.
        :param s: initial state
        :param space: observation space used to discretize states
        :return: selected action
        """
        self.total_actions += 1
        # Epsilon-greedy action selection
        if np.random.random() < self.epsilon:
            self.random_actions += 1
            action = np.random.choice(self.na)
        else:
            action = np.argmax([
                self.Q[discretize_state(s, space), i] for i in range(self.na)
            ])

        self.epsilon *= self.epsilon_decay_rate

        # Remember current state and action for the next step
        self.s = s
        self.a = action

        return action
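
Put together, a training loop over these methods might look like the following sketch, assuming the classic gym reset()/step() API and the hypothetical QLearningAgent constructor sketched above (the environment choice and episode count are illustrative):

import gym

env = gym.make("MountainCar-v0")
# 10 bins per observation dimension, matching the discretize_state sketch
agent = QLearningAgent(n_states=10 ** env.observation_space.shape[0],
                       n_actions=env.action_space.n)

for episode in range(1000):
    s = env.reset()
    a = agent._query_initial(s, env.observation_space)
    done = False
    while not done:
        sp, r, done, _info = env.step(a)
        # Learn from (s, a, sp, r) and pick the next action
        a, _delta = agent._query(s, a, sp, r, env.observation_space)
        s = sp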