def speedy_minimax(state, depth, θ, searched_states=None, first=False, memoised_states=None):
    if state.training_terminal_test():
        return state.utility(), searched_states
    if depth == 0:
        return H(Φ(state, memoised_states), θ), searched_states

    maxEval = -INF
    if first:
        # At the root, fan the child searches out across a process pool, sharing
        # the memoised-features dict through a Manager proxy.
        with Manager() as m:
            d = m.dict(memoised_states)
            children = [(state.result(a), depth - 1, θ, searched_states, False, d)
                        for a in state.actions()]
            with Pool(PROCESSES) as p:
                results = p.starmap(speedy_minimax, children)
            memoised_states.update(dict(d))
            # Negate the child evaluations (negamax convention), matching the
            # sequential branch below.
            maxEval = max(-res[0] for res in results)
            for s in (res[1] for res in results):
                searched_states.update(s)
    else:
        for a in state.actions():
            child = state.result(a)
            maxEval = max(
                maxEval,
                -speedy_minimax(child, depth - 1, θ, searched_states,
                                memoised_states=memoised_states)[0])

    if searched_states is not None:
        # Store the state, its V(s) and H(s)
        features = Φ(state, memoised_states)
        searched_states.add(
            (hash(state), maxEval, H(features, θ), features.tobytes(), depth))

    return maxEval, searched_states
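# Minimal usage sketch (not part of the original listing): invoking the parallel
# search at the root of a training game, mirroring the MULTI branch of
# tree_strap_train below. The zero-initialised θ is an assumption for illustration.
#
#     state = State()
#     θ = np.zeros(num_features)      # illustrative initial weights
#     memoised_features = {}          # feature cache shared with the worker pool
#     searched_states = set()         # receives (hash, V(s), H(s), features, depth)
#     V, searched_states = speedy_minimax(state, TRAIN_DEPTH, θ, searched_states,
#                                         first=True, memoised_states=memoised_features)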
def negamax(state, alpha, beta, depth, θ, memoised_states=None):
    if state.training_terminal_test():
        return state.utility(train=True)
    if depth == 0:
        if memoised_states:
            return H(Φ(state, memoised_states), θ)
        return H(Φ(state), θ)

    v = -4 * INF
    for a in state.actions():
        child = state.result(a)
        v = max(v, -negamax(child, -beta, -alpha, depth - 1, θ, memoised_states))
        if v >= beta:
            return v  # beta cutoff
        alpha = max(alpha, v)
    return v
def minimax(state, depth, θ, searched_states=None):
    if state.stages_terminal_test():
        return state.utility()
    if depth == 0:
        return H(Φ(state), θ)

    maxEval = -INF
    for a in state.actions():
        child = state.result(a)
        maxEval = max(maxEval, -minimax(child, depth - 1, θ, searched_states))

    if searched_states is not None:
        # Store the state, its V(s) and H(s)
        features = Φ(state)
        searched_states.append((state, maxEval, H(features, θ), features, depth))

    return maxEval
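# Hedged sketch (not part of the original listing): a TreeStrap-style update over
# the tuples collected by minimax above, consolidating the commented-out loop and
# the clipping step found in tree_strap_train below. The function name and the
# list-of-arrays form of `features` are assumptions for illustration.
def treestrap_update(searched_states, θ, depth, α, λ, MAX_CHANGE):
    # Accumulate Δθ = Σ α·(V(s) − H(s))·Φ(s)·λ**(depth − d), only at depths where
    # the same player is to move as at the root.
    Δθ = np.zeros_like(θ)
    for s, vs, hs, features, d in searched_states:
        if d % 2 == 0:
            δ = vs - hs
            Δθ += α * δ * features * λ ** (depth - d)
    # Clip each component to ±MAX_CHANGE before applying.
    np.clip(Δθ, -MAX_CHANGE, MAX_CHANGE, out=Δθ)
    θ += Δθ
    return θ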
def negamax(state, alpha, beta, depth, θ, memoised_states=None):
    if state.training_terminal_test():
        return state.utility(train_end=True)
    if depth == 0:
        if memoised_states:
            return H(Φ(state, memoised_states), θ)
        return H(Φ(state), θ)

    v = -INF * 10
    for a in state.actions():
        child = state.result(a)
        v = max(v, -negamax(child, -beta, -alpha, depth - 1, θ, memoised_states))
        alpha = max(alpha, v)
        if alpha >= beta:
            return v  # beta cutoff
    return v
def min_value(state, alpha, beta, depth, θ, searched_states, memoised_features=None):
    if state.training_terminal_test():
        return state.utility(train=True), -state.utility(train=True)

    v0, v1 = 4 * INF, -4 * INF
    # We assume the whole alpha-beta search is called with an even depth, so
    # depth == 1 is the last min_value call before the leaves.
    if depth == 1:
        v1 = H(Φ(state, memoised_features), θ)

    for a in state.actions():
        child = state.result(a)
        pot_v0, pot_v1 = max_value(child, alpha, beta, depth - 1, θ,
                                   searched_states, memoised_features)
        v1 = max(v1, pot_v1)
        v0 = min(v0, pot_v0)
        if v0 <= alpha:
            if depth > 1:
                # Cutoff: v0 is an upper bound, v1 is a lower bound.
                features = Φ(state, memoised_features)
                searched_states.append(
                    (state, v1, L_BOUND, H(features, θ), features, depth))
            return v0, v1
        beta = min(beta, v0)

    if depth > 1:
        # No cutoff: v0 and v1 are both exact.
        features = Φ(state, memoised_features)
        searched_states.append((state, v1, EXACT, H(features, θ), features, depth))

    return v0, v1
def negamax(self, state, alpha, beta, depth, θ):
    if state.terminal_test():
        return state.utility()
    if depth == 0:
        return H(Φ(state), θ)

    v = -INF
    for a in state.actions():
        child = state.result(a)
        v = max(v, -self.negamax(child, -beta, -alpha, depth - 1, θ))
        if v >= beta:
            return v  # beta cutoff
        alpha = max(alpha, v)
    return v
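# Hypothetical fragment (not part of the original listing): how an agent might pick
# a move with the negamax method above, mirroring the root action loop at the end of
# tree_strap_train. The method name best_move is an assumption.
def best_move(self, state, depth, θ):
    best_action, alpha, beta = None, -INF, INF
    for a in state.actions():
        child = state.result(a)
        v = -self.negamax(child, -beta, -alpha, depth - 1, θ)
        if v > alpha:
            alpha, best_action = v, a
    return best_action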
def max_value(state, alpha, beta, depth, θ, searched_states, memoised_features=None):
    if state.training_terminal_test():
        return state.utility(train=True), -state.utility(train=True)
    if depth == 0:
        v0 = H(Φ(state, memoised_features), θ)
        return v0, -INF * 4

    v0, v1 = -4 * INF, 4 * INF
    for a in state.actions():
        child = state.result(a)
        pot_v0, pot_v1 = min_value(child, alpha, beta, depth - 1, θ,
                                   searched_states, memoised_features)
        v1 = min(v1, pot_v1)
        v0 = max(v0, pot_v0)
        if v0 >= beta:
            if depth > 1:
                # Cutoff: v0 is a lower bound, v1 is an upper bound.
                features = Φ(state, memoised_features)
                searched_states.append(
                    (state, v0, L_BOUND, H(features, θ), features, depth))
            return v0, v1
        alpha = max(alpha, v0)

    if depth > 1:
        # No cutoff: v0 and v1 are both exact.
        features = Φ(state, memoised_features)
        searched_states.append((state, v0, EXACT, H(features, θ), features, depth))

    return v0, v1
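# Hedged sketch: alpha_beta_train is called from tree_strap_train below but is not
# shown in this listing. A plausible wrapper simply launches max_value from the root
# with full (-INF, INF) bounds; the exact signature and behaviour are assumptions.
def alpha_beta_train(state, θ, searched_states, depth, memoised_features=None):
    return max_value(state, -4 * INF, 4 * INF, depth, θ,
                     searched_states, memoised_features)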
def tree_strap_train(θo, θd, θm, θe, depth=TRAIN_DEPTH):
    state = State()
    memoised_features = {}
    # Sample how many opening turns will be played randomly (weighted choice).
    random_turns = np.random.choice([6] * 2 + [8] * 4 + [16] * 4 + [32] * 8)
    # Decide whether each player will use the opening book (probability 1/4 each).
    X_use_book = np.random.choice([0, 0, 0, 1])
    O_use_book = np.random.choice([0, 0, 0, 1])

    while not state.training_terminal_test():
        print(f'Turn number {state.turn}')
        print(state)
        print()

        # Select the weight vector for the current stage of the game.
        if state.stage[0] == OPN:
            θ = θo
        elif state.stage[0] == DEV:
            θ = θd
        elif state.stage[0] == MID:
            θ = θm
        else:
            θ = θe

        if ((state.turn % 2 and X_use_book) or
                (not state.turn % 2 and O_use_book)) and (str(state.board) in opening_book):
            # Play the book move if this player uses the book and the position is in it.
            state = state.result(tuple(opening_book[str(state.board)]))
        elif state.turn < random_turns:
            # Play a uniformly random move during the randomised opening turns.
            num_actions = len(state.actions(False))
            state = state.result(state.actions(False)[np.random.choice(num_actions)])
        else:
            # Search the position to obtain a training target V.
            if MULTI:
                searched_states = set()
                V = speedy_minimax(state, depth, θ, searched_states,
                                   first=True, memoised_states=memoised_features)[0]
            elif not AB_TRAIN:
                searched_states = []
                V = negamax(state, -10 * INF, 10 * INF, depth, θ, memoised_features)

            if AB_TRAIN:
                searched_states = []
                alpha_beta_train(state, θ, searched_states, TRAIN_DEPTH, memoised_features)
                ab_weight_updates(searched_states, θ, depth, α, λ, MAX_CHANGE)
            else:
                Δθ = np.zeros(num_features)
                # TreeStrap-style update over all searched states (currently disabled):
                # for s, vs, hs, features, d in searched_states:
                #     # updates should only happen for states that match the player to play
                #     if not d % 2:
                #         features = np.frombuffer(features)
                #         𝛿 = vs - hs
                #         Δθ += α * 𝛿 * features * λ ** (depth - d)

                # Root-only update towards the searched value V.
                if V != 0:
                    features = Φ(state, memoised_features)
                    h = H(features, θ)
                    𝛿 = V - h
                    Δθ += α * 𝛿 * features
                # Clip each weight change to ±MAX_CHANGE before applying.
                for i in range(num_features):
                    if Δθ[i] > MAX_CHANGE:
                        Δθ[i] = MAX_CHANGE
                    elif Δθ[i] < -MAX_CHANGE:
                        Δθ[i] = -MAX_CHANGE
                θ += Δθ

            # Choose the move to play with a fresh search using the updated weights.
            best_action = None
            alpha, beta = -4 * INF, 4 * INF
            for a in state.actions():
                child = state.result(a)
                nmax = -negamax(child, -beta, -alpha, depth - 1, θ, memoised_features)
                if nmax > alpha:
                    alpha = nmax
                    best_action = a
            state = state.result(best_action)
            print(alpha)

    print('Terminal State:')
    print(state)
    memoised_features = None
    gc.collect()
    return θo, θd, θm, θe
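# Illustrative training driver (not part of the original listing): the number of
# self-play games, the zero initialisation, and the save path are assumptions.
#
#     θo = np.zeros(num_features)
#     θd = np.zeros(num_features)
#     θm = np.zeros(num_features)
#     θe = np.zeros(num_features)
#
#     for game in range(100):   # number of self-play games is arbitrary
#         θo, θd, θm, θe = tree_strap_train(θo, θd, θm, θe, depth=TRAIN_DEPTH)
#         np.save('weights.npy', np.vstack([θo, θd, θm, θe]))   # hypothetical persistence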