def tree_strap_train(θo, θd, θm, θe, depth=TRAIN_DEPTH):
    """Self-play training game with one weight vector per game stage."""
    state = State()
    # memoised_features = {} if MULTI else None
    memoised_features = {}
    # Number of random opening turns, drawn from a weighted distribution
    # (the zero-weight entries are currently disabled).
    random_turns = np.random.choice(
        [0] * 0 + [2] * 0 + [6] * 2 + [8] * 4 + [16] * 4 + [32] * 8)

    # Decide whether each player will follow the opening book this game.
    X_use_book = np.random.choice([0, 0, 0, 1])
    O_use_book = np.random.choice([0, 0, 0, 1])

    while not state.training_terminal_test():
        print(f'Turn number {state.turn}')
        print(state)
        print()

        # Pick the weight vector for the current stage of the game.
        if state.stage[0] == OPN:
            θ = θo
        elif state.stage[0] == DEV:
            θ = θd
        elif state.stage[0] == MID:
            θ = θm
        else:
            θ = θe

        # depth = 2*TRAIN_DEPTH
        if (((state.turn % 2 and X_use_book)
                or (not state.turn % 2 and O_use_book))
                and str(state.board) in opening_book):
            # Follow the opening book.
            state = state.result(tuple(opening_book[str(state.board)]))
        elif state.turn < random_turns:
            # Randomised opening: play a uniformly random legal action.
            num_actions = len(state.actions(False))
            state = state.result(
                state.actions(False)[np.random.choice(num_actions)])
        else:
            if MULTI:
                searched_states = set()
                V = speedy_minimax(state, depth, θ, searched_states,
                                   first=True,
                                   memoised_states=memoised_features)[0]
            elif not AB_TRAIN:
                searched_states = []
                V = negamax(state, -10 * INF, 10 * INF, depth, θ,
                            memoised_features)

            if AB_TRAIN:
                searched_states = []
                alpha_beta_train(state, θ, searched_states, TRAIN_DEPTH,
                                 memoised_features)
                ab_weight_updates(searched_states, θ, depth, α, λ, MAX_CHANGE)
            else:
                Δθ = np.zeros(num_features)
                # for s, vs, hs, features, d in searched_states:
                #     # Updates should only happen for states that match
                #     # the player to play.
                #     if not d % 2:
                #         features = np.frombuffer(features)
                #         # 𝛿 = V(s) - H(features, θ)
                #         𝛿 = vs - hs
                #         Δθ += α*𝛿*features*λ**(depth-d)
                if V != 0:
                    # Move the root evaluation towards the search value V.
                    features = Φ(state, memoised_features)
                    h = H(features, θ)
                    𝛿 = V - h
                    Δθ += α * 𝛿 * features
                # Clip each component of the update to ±MAX_CHANGE.
                np.clip(Δθ, -MAX_CHANGE, MAX_CHANGE, out=Δθ)
                θ += Δθ

            # Select the move to play with a fresh negamax search.
            best_action = None
            alpha, beta = -4 * INF, 4 * INF
            for a in state.actions():
                child = state.result(a)
                nmax = -negamax(child, -beta, -alpha, depth - 1, θ,
                                memoised_features)
                if nmax > alpha:
                    alpha = nmax
                    best_action = a
            state = state.result(best_action)
            print(alpha)

    print('Terminal State:')
    print(state)
    memoised_features = None
    gc.collect()
    return θo, θd, θm, θe
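
# A minimal usage sketch (an assumption, not from the original source):
# num_features is taken to be the global feature-vector length shared by
# Φ and H, and NUM_GAMES is a hypothetical constant named here only for
# illustration.
#
# θo, θd, θm, θe = (np.zeros(num_features) for _ in range(4))
# for _ in range(NUM_GAMES):
#     θo, θd, θm, θe = tree_strap_train(θo, θd, θm, θe)
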
def play(θo, θm, θe, depth=TRAIN_DEPTH):
    """Play a game between the trained agent and a human at the terminal."""
    OPN, MID, END = 0, 1, 2
    state = State()
    first = np.random.choice([0, 1])  # which side the agent plays
    random_turns = 0  # np.random.choice([0] + [2]*2 + [4]*4 + [8]*8 + 16*[16] + 32*[32])

    while not state.terminal_test():
        print(f'Turn number {state.turn}')
        print_board(state.board)
        print()

        if (state.turn + first) % 2:
            # Agent's turn: pick the weight vector by remaining piece count.
            if state.board[state.board > 0].sum() == 12:
                θ = θo
            elif state.board[state.board > 0].sum() > 5:
                θ = θm
            else:
                θ = θe
            state.history[state] += 1

            if state.turn < random_turns:
                num_actions = len(state.actions(False))
                state = state.result(
                    state.actions(False)[np.random.choice(num_actions)])
            else:
                # Online TreeStrap update from the states searched this turn.
                searched_states = []
                V = minimax(State(state.board), depth, θ, searched_states)
                Δθ = np.zeros(num_features)
                for s, vs, hs, features, d in searched_states:
                    # 𝛿 = V(s) - H(features, θ)
                    𝛿 = vs - hs
                    Δθ += α * 𝛿 * features * λ**(depth - d)
                # Clip each component of the update to ±MAX_CHANGE.
                np.clip(Δθ, -MAX_CHANGE, MAX_CHANGE, out=Δθ)
                θ += Δθ

                # Choose the action with the best negamax value.
                actions = []
                for a in state.actions():
                    child = state.result(a)
                    actions.append((-negamax(State(-1 * child.board), -INF,
                                             INF, depth - 1, θ), a))
                state = state.result(max(actions)[1])
        else:
            # Human's turn: list the legal actions and read an index.
            print(actions_with_indices(translate_actions(state.actions())))
            i = int(input())
            state = state.result(state.actions()[i])

        # Flip the board so the player to move is always positive, and
        # advance the turn counter.
        state.board *= -1
        state.turn += 1

    print(state)
    print('Game over!')
    return θo, θm, θe
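
# A minimal sketch of a human-vs-agent game, assuming the weight vectors
# come from tree_strap_train above (which also returns the development-stage
# vector θd that play does not take):
#
# θo, θd, θm, θe = tree_strap_train(θo, θd, θm, θe)
# play(θo, θm, θe, depth=TRAIN_DEPTH)
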
# Fragment of an earlier version of the play loop; per the comment below,
# the flipped-features update was abandoned as incorrect.
while not state.terminal_test():
    print(f'Turn number {state.turn}')
    print(state)
    print()
    if state.board[state.board > 0].sum() == 12:
        θ = θo
    elif state.board[state.board > 0].sum() > 5:
        θ = θm
    else:
        θ = θe
    state.history[state] += 1
    if state.turn < random_turns:
        num_actions = len(state.actions(False))
        state = state.result(
            state.actions(False)[np.random.choice(num_actions)])
    else:
        searched_states = []
        V = minimax(State(state.board), depth, θ, searched_states)
        Δθ = np.zeros(num_features)
        for s, vs, hs, features, d in searched_states:
            # 𝛿 = V(s) - H(features, θ)
            𝛿 = vs - hs
            Δθ += α * 𝛿 * features * λ**(depth - d)
            # s.board *= -1
            # flipped_features = Φ(s)
            # 𝛿 = -(vs - hs)  # THIS IS ALL WRONG BTW, RECALCULATE V AND H
            # Δθ += α*𝛿*flipped_features*λ**(depth-d)