def update_chain(self, state, action, reward):
    """Store or update the value kept in ``self.q`` for ``action`` in ``state``.

    The state is keyed by ``listhash(state)``.  The first time an action
    is recorded for a state its value is set to ``reward`` as-is.  On
    later calls the stored value ``v`` is replaced by
    ``v + self.learning_rate * (reward - self.discount * v)``.

    Args:
        state: The state to update; passed to ``listhash`` to build the key.
        action: The action whose value is being recorded (used as a dict key).
        reward: The reward observed for taking ``action`` in ``state``.
    """
    key = listhash(state)
    if key not in self.q:
        self.q[key] = {}
    action_values = self.q[key]

    # First observation of this action: seed the table with the raw reward.
    if action not in action_values:
        action_values[action] = reward
        return

    previous = action_values[action]
    action_values[action] = previous + self.learning_rate * (
        reward - self.discount * previous
    )
def get_greedy_action(self, available_actions):
    """Return the available action with the highest stored value.

    Each state yielded by ``sublists(self.chain)`` is hashed with
    ``listhash`` and looked up in ``self.q``.  For every state that has
    an entry, the stored values of the actions in ``available_actions``
    are compared and the action with the largest value overall wins.
    On ties the action encountered first is kept.

    Args:
        available_actions: Iterable of candidate actions.  It is iterated
            once per matching state, so it should be re-iterable
            (e.g. a list), not a one-shot generator.

    Returns:
        The best action found, or ``None`` when no available action has
        a stored value in any matching state.
    """
    best_action = None
    best_value = None
    # Search the previously seen states that match the current chain.
    for state in sublists(self.chain):
        key = listhash(state)
        if key not in self.q:
            continue
        action_values = self.q[key]
        for action in available_actions:
            if action not in action_values:
                continue
            value = action_values[action]
            # Compare against None explicitly: ordering a number against
            # None raises TypeError on Python 3.
            if best_value is None or value > best_value:
                best_action = action
                best_value = value
    return best_action