Exemple #1
0
 def choose_action(self, state):
     """Pick an action for ``state`` using an epsilon-greedy rule.

     With probability ``self.epsilon`` a uniformly random action is
     returned; otherwise one of the actions with the highest stored value
     in ``self.q`` is returned, ties being broken at random.

     Args:
         state: Hashable state identifier; combined with each action to
             form the ``(state, action)`` key looked up in ``self.q``.

     Returns:
         One of the actions produced by ``Action.get_actions()``.
     """
     # NOTE(review): self.q is indexed directly, so it must yield a value
     # for every (state, action) pair (e.g. a defaultdict); a plain dict
     # would raise KeyError for unseen pairs -- confirm in the constructor.
     action2q = {a: self.q[(state, a)] for a in Action.get_actions()}
     if random.random() < self.epsilon:
         # random.choice needs an indexable sequence. On Python 3
         # dict.keys() is a view and raises TypeError, so build a list.
         return random.choice(list(action2q))
     max_q = max(action2q.values())
     best_actions = [a for a, q in action2q.items() if q == max_q]
     return random.choice(best_actions)
Exemple #2
0
 def learn_q(self, last_state, action, reward, now_state):
     """Apply one Q-learning update for an observed transition.

     The entry ``self.q[(last_state, action)]`` is moved towards
     ``reward + self.gamma * max_a self.q[(now_state, a)]`` by a step of
     size ``self.alpha``. The table is modified in place and nothing is
     returned.

     Args:
         last_state: State in which ``action`` was taken.
         action: The action that was taken.
         reward: Reward observed for the transition.
         now_state: State reached after taking ``action``.
     """
     key = (last_state, action)
     # Looked up before the membership test below, in the same order as
     # the lookups have always been performed.
     best_next = max(self.q[(now_state, a)] for a in Action.get_actions())
     # Initialise every unseen (state, action) entry with the reward.
     if key not in self.q:
         self.q[key] = reward
     td_error = reward + self.gamma * best_next - self.q[key]
     self.q[key] += self.alpha * td_error