# The functions in this listing assume two helpers from the surrounding
# codebase: `inf` (float('inf'), e.g. `from math import inf`) and a
# utils-style `argmax(seq, key=...)` that returns the element of seq with
# the largest key value (as in aima-python's utils module).


def minimax_decision(state, game):
    """Given a state in a game, calculate the best move by searching
    forward all the way to the terminal states. [Figure 5.3]"""
    player = game.to_move(state)

    def max_value(state):
        if game.terminal_test(state):
            return game.utility(state, player)
        v = -inf
        for a in game.actions(state):
            v = max(v, min_value(game.result(state, a)))
        return v

    def min_value(state):
        if game.terminal_test(state):
            return game.utility(state, player)
        v = inf
        for a in game.actions(state):
            v = min(v, max_value(game.result(state, a)))
        return v

    # Body of minimax_decision:
    return argmax(game.actions(state), key=lambda a: min_value(game.result(state, a)))
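
# A usage sketch, not part of the function above: it assumes the surrounding
# repo provides a TicTacToe class (as in aima-python's games module) that
# implements the to_move/actions/result/terminal_test/utility interface
# minimax_decision relies on. The 3x3 game tree is small enough to search
# exhaustively.
#
#     from games import TicTacToe
#     ttt = TicTacToe()
#     print(minimax_decision(ttt.initial, ttt))   # an optimal opening move, e.g. (1, 1)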

def best_policy(mdp, U):
    """Given an MDP and a utility function U, determine the best policy,
    as a mapping from state to action."""
    pi = {}
    for s in mdp.states:
        pi[s] = argmax(mdp.actions(s), key=lambda a: q_value(mdp, s, a, U))
    return pi
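
# A usage sketch under assumptions about the surrounding repo (as in
# aima-python's mdp module): `value_iteration` returns a utility table U for
# the 4x3 grid world `sequential_decision_environment`, `q_value` is available
# at module level, and GridMDP.to_arrows renders a policy as arrows.
#
#     from mdp import value_iteration, sequential_decision_environment
#     U = value_iteration(sequential_decision_environment, epsilon=0.001)
#     pi = best_policy(sequential_decision_environment, U)
#     print(sequential_decision_environment.to_arrows(pi))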

def execute(self, percept):
    """Execute the information gathering algorithm."""
    self.observation = self.integrate_percept(percept)
    vpis = self.vpi_cost_ratio(self.variables)
    # argmax returns an element of the sequence, not an index, so select the
    # best index explicitly rather than indexing self.variables with a ratio.
    j = argmax(range(len(vpis)), key=lambda i: vpis[i])
    variable = self.variables[j]
    if self.vpi(variable) > self.cost(variable):
        return self.request(variable)
    return self.decnet.best_action()
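
# A self-contained illustration of the myopic rule execute() applies (the
# variable names and numbers below are hypothetical, not from the repo):
# consider the observation with the best VPI-to-cost ratio, and request it
# only if its value of information exceeds its cost; otherwise act now.
vpi_est = {'SeismicTest': 12.0, 'ExpertSurvey': 3.0}    # hypothetical VPI estimates
obs_cost = {'SeismicTest': 10.0, 'ExpertSurvey': 5.0}   # hypothetical observation costs
best = max(vpi_est, key=lambda v: vpi_est[v] / obs_cost[v])
worth_requesting = vpi_est[best] > obs_cost[best]
print(best, worth_requesting)   # SeismicTest True -> gather this evidence before acting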

def policy_iteration(mdp):
    """Solve an MDP by policy iteration [Figure 17.7]"""
    U = {s: 0 for s in mdp.states}
    pi = {s: random.choice(mdp.actions(s)) for s in mdp.states}
    while True:
        U = policy_evaluation(pi, U, mdp)
        unchanged = True
        for s in mdp.states:
            a_star = argmax(mdp.actions(s), key=lambda a: q_value(mdp, s, a, U))
            # a = argmax(mdp.actions(s), key=lambda a: expected_utility(a, s, U, mdp))
            if q_value(mdp, s, a_star, U) > q_value(mdp, s, pi[s], U):
                pi[s] = a_star
                unchanged = False
        if unchanged:
            return pi
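
# A usage sketch on the same 4x3 grid world, again assuming the surrounding
# repo's mdp module (sequential_decision_environment, policy_evaluation,
# q_value) and a module-level `import random` as referenced above.
#
#     from mdp import sequential_decision_environment
#     pi = policy_iteration(sequential_decision_environment)
#     print(sequential_decision_environment.to_arrows(pi))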

def __call__(self, percept):
    """Q-learning agent program: update Q[s, a] for the previous state-action
    pair from the stored reward r and the new state s1, then choose the next
    action using the exploration function f."""
    s1, r1 = self.update_state(percept)
    Q, Nsa, s, a, r = self.Q, self.Nsa, self.s, self.a, self.r
    alpha, gamma, terminals = self.alpha, self.gamma, self.terminals
    actions_in_state = self.actions_in_state

    if s in terminals:
        Q[s, None] = r1
    if s is not None:
        Nsa[s, a] += 1
        Q[s, a] += alpha(Nsa[s, a]) * (r + gamma * max(Q[s1, a1] for a1 in actions_in_state(s1))
                                       - Q[s, a])
    if s in terminals:
        self.s = self.a = self.r = None
    else:
        self.s, self.r = s1, r1
        self.a = argmax(actions_in_state(s1), key=lambda a1: self.f(Q[s1, a1], Nsa[s1, a1]))
    return self.a
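
# A usage sketch under assumptions about the surrounding repo (as in
# aima-python's rl and mdp modules): __call__ above is the agent program of
# QLearningAgent, and run_single_trial drives one episode through an MDP.
#
#     from mdp import sequential_decision_environment
#     from rl import QLearningAgent, run_single_trial
#
#     q_agent = QLearningAgent(sequential_decision_environment, Ne=5, Rplus=2,
#                              alpha=lambda n: 60. / (59 + n))
#     for _ in range(200):
#         run_single_trial(q_agent, sequential_decision_environment)
#     # q_agent.Q now maps (state, action) pairs to learned Q-values.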

def expectiminimax(state, game):
    """Return the best move for a player after dice are thrown. The game tree
    includes chance nodes along with min and max nodes. [Figure 5.11]"""
    player = game.to_move(state)

    def max_value(state):
        v = -inf
        for a in game.actions(state):
            v = max(v, chance_node(state, a))
        return v

    def min_value(state):
        v = inf
        for a in game.actions(state):
            v = min(v, chance_node(state, a))
        return v

    def chance_node(state, action):
        res_state = game.result(state, action)
        if game.terminal_test(res_state):
            return game.utility(res_state, player)
        # Expected utility over the chance outcomes: each outcome is generated
        # from the post-action state and weighted by its probability (the
        # probabilities are assumed to sum to 1).
        sum_chances = 0
        for chance in game.chances(res_state):
            chance_state = game.outcome(res_state, chance)
            if chance_state.to_move == player:
                util = max_value(chance_state)
            else:
                util = min_value(chance_state)
            sum_chances += util * game.probability(chance)
        return sum_chances

    # Body of expectiminimax:
    return argmax(game.actions(state), key=lambda a: chance_node(state, a), default=None)
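
# A self-contained toy stochastic game (illustrative only, not from the repo)
# to exercise expectiminimax: MAX either takes a safe payoff of 1 or gambles
# on a fair coin worth 3 on heads and 0 on tails, so the chance node is worth
# 0.5*3 + 0.5*0 = 1.5 and the gamble is the better move.
from collections import namedtuple

CoinState = namedtuple('CoinState', 'to_move, label')


class CoinFlipGame:
    initial = CoinState(to_move='MAX', label='root')

    def to_move(self, state):
        return state.to_move

    def actions(self, state):
        return ['safe', 'risky'] if state.label == 'root' else ['collect']

    def result(self, state, action):
        if state.label == 'root':
            return CoinState('MAX', 'flip' if action == 'risky' else 'safe-end')
        return CoinState('MAX', state.label + '-end')

    def terminal_test(self, state):
        return state.label.endswith('-end')

    def utility(self, state, player):
        return {'safe-end': 1, 'heads-end': 3, 'tails-end': 0}[state.label]

    def chances(self, state):            # possible coin outcomes at the chance node
        return ['heads', 'tails']

    def outcome(self, state, chance):    # the state after the coin lands
        return CoinState('MAX', chance)

    def probability(self, chance):       # fair coin
        return 0.5


print(expectiminimax(CoinFlipGame.initial, CoinFlipGame()))   # -> 'risky'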

def program(percept):
    """A decision-theoretic agent program: fold the latest percept into the
    belief state, then act to maximize expected outcome utility."""
    belief_state.observe(program.action, percept)
    program.action = argmax(belief_state.actions(),
                            key=belief_state.expected_outcome_utility)
    return program.action
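
# A self-contained illustration of the control flow only (StubBeliefState and
# its numbers are hypothetical): `program` expects a module-level
# `belief_state` and an initial `program.action`; each call then updates the
# beliefs and returns the action with the highest expected outcome utility.
class StubBeliefState:
    utilities = {'left': 0.2, 'right': 0.7, 'wait': 0.5}   # made-up expected utilities

    def observe(self, action, percept):
        pass   # a real belief state would update P(state | action, percept history)

    def actions(self):
        return list(self.utilities)

    def expected_outcome_utility(self, action):
        return self.utilities[action]


belief_state = StubBeliefState()
program.action = None
print(program('some-percept'))   # -> 'right', the action with the highest expected utility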