def select_best_move(self, stats, depth, board, color):
    """Select the best move at the end of the Monte Carlo tree search"""
    bestscore = 0
    bestmove = None
    total_n = 0
    for action in SP.available_actions(board):
        next_board = board[:]
        SP.play(next_board, action, color)
        n, w = stats[MCTSRandomPlayer.to_board_id(next_board)]
        if n == 0:
            continue
        total_n += n
        if self.DEBUG:
            print('Move %d score: %d/%d (%0.1f%%)' % (action, w, n, w / n * 100))
        # follow the most visited path (robust child): visit counts are a more
        # stable signal than raw win rates; ties are broken randomly
        if n > bestscore or (n == bestscore and random.random() <= 0.5):
            bestmove = action
            bestscore = n
    assert bestmove is not None
    if self.DEBUG:
        print('Maximum depth: %d, Total simulations: %d on %d' %
              (depth, total_n, MCTSRandomPlayer.to_board_id(board)))
    return bestmove
def negamax(self, state, color, depth=10):
    '''
    implement negamax algorithm
    https://en.wikipedia.org/wiki/Negamax
    '''
    # negamax.counter += 1

    # CHECK LEAF NODE / NO NEED TO CHECK DEPTH == 0 BECAUSE TicTacToe is so small
    # LEAF NODE is checked at play time

    # Transposition table lookup (state)
    # _id = ob.board_id
    _id = OptimalBoard.board_to_id(state)
    cache = self.tp.get(_id)
    if cache is not None:  # BUG FIX: a cached score can be 0, so check against None explicitly
        # case 1
        # return cache
        # case 2
        return cache[0], random.choice(cache[1])

    # RECURSIVE
    actions = SP.available_actions(state)
    random.shuffle(actions)  # move ordering improves performance when alpha-beta pruning is used
    best_score = -math.inf
    best_actions = []
    for action in actions:
        next_s = state[:]
        score, done = SP.play(next_s, action, color)
        if not done:
            score, _ = self.negamax(next_s, SP.next(color), depth - 1)
            score = -score  # negamax
        # keep every action that ties for the best score
        if score > best_score:
            best_score = score
            best_actions = [action]
        elif score == best_score:
            best_actions.append(action)

    # case 1: choose a random best move once and cache that single choice
    # choosed_result = random.choice(best_scores)
    # tp.put(_id, choosed_result)
    # return choosed_result

    # case 2: cache all best moves and choose a random one every time
    self.tp.put(_id, (best_score, best_actions))
    return (best_score, random.choice(best_actions))
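# A minimal, assumed sketch of a transposition table that would satisfy the tp.get/tp.put
# calls used by negamax above; the project's real self.tp class is not shown in this
# section, so treat this dict-backed version as an illustration only.
class SimpleTranspositionTable:
    def __init__(self):
        self._table = {}

    def get(self, key):
        # return the cached (best_score, best_actions) tuple, or None on a miss
        return self._table.get(key)

    def put(self, key, value):
        self._table[key] = value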
def find_next(board, color, seq):
    actions = SP.available_actions(board)
    for action in actions:
        new_board = board[:]
        reward, done = SP.play(new_board, action, color)
        if done:
            # terminal node: print the move sequence and the resulting board id
            if reward == 0:
                print(seq + str(action), '=', OB.board_to_id(new_board))
            else:
                print(seq + str(action), MARKER[color], OB.board_to_id(new_board))
        else:
            find_next(new_board, SP.next(color), seq + str(action))
def search(self, board, start_color, simulations, C):
    '''
    implement monte carlo tree search algorithm
    '''
    # CHECK LEAF NODE / NO NEED TO CHECK DEPTH == 0 BECAUSE TicTacToe is so small
    # LEAF NODE is checked at play time
    stats = self._stats
    root = board
    max_depth = 0
    for _ in range(simulations):
        node = root[:]
        states = []

        # selection: walk down the tree until an unexpanded node or a terminal state is reached
        depth = 0
        done = False
        color = start_color
        while not done:
            depth += 1
            action, select = self.select_next_move(stats, node, color, C)
            reward, done = SP.play(node, action, color)
            color = SP.next(color)
            states.append(MCTSRandomPlayer.to_board_id(node))
            if not select:
                break
        max_depth = max(depth, max_depth)

        # simulation: run a random playout if not at the end of the game tree
        if not done:
            result = self.simulate(node, start_color)  # TODO: decide how to handle this case
        else:
            if reward == 0:
                result = 0.5
            else:
                result = 0

        # backpropagation: update visit counts and scores along the path,
        # flipping the result at each level because the players alternate
        for state in reversed(states):
            result = 1 - result
            stats[state][0] += 1
            stats[state][1] += result

    return stats, max_depth
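# A hedged sketch of how search() and select_best_move() might be wired together when it
# is this player's turn; the get_move name and the default simulations/C values are
# illustrative assumptions, not the project's actual interface.
def get_move(self, board, color, simulations=1000, C=1.4):
    stats, max_depth = self.search(board, color, simulations, C)
    return self.select_best_move(stats, max_depth, board, color)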
def simulate(self, board, start_color):
    # random simulator (light playouts)
    node = board[:]
    done = False
    color = start_color
    while not done:
        actions = SP.available_actions(node)
        reward, done = SP.play(node, random.choice(actions), color)
        color = SP.next(color)

    if reward == 0:  # TIE
        return 0.5
    elif color == start_color:
        return 1
    else:
        return 0
def select_next_move(self, stats, board, color, C):
    """Select the next state and consider if it should be expanded (UCT)"""
    bestscore = None
    bestmove = None
    # my_id = MCTSRandomPlayer.to_board_id(board)
    children = []
    for action in SP.available_actions(board):
        # clone-and-play mode - could also be a play-and-rollback mode
        next_board = board[:]
        SP.play(next_board, action, color)
        children.append((action, stats[MCTSRandomPlayer.to_board_id(next_board)]))

    total_n = sum(x[0] for (_, x) in children)

    for child_move, child_stat in children:
        n, w = child_stat
        if n == 0:
            # never visited: expand this child right away
            return child_move, False
        else:
            # high win rate (exploitation) plus a bonus for rarely visited nodes (exploration)
            score = (w / n) + C * math.sqrt(2 * math.log(total_n) / n)
            # if my_id == 70645:
            #     print("CHECK IN ", my_id, child_move, w, n, score, bestscore, next_id)
            # if next_id == 119797:
            #     print("JUMP IN ", my_id, child_move, w, n, score, bestscore, next_id)
            if bestscore is None or score > bestscore:
                bestscore = score
                bestmove = child_move

    # if my_id == 70645:
    #     print("SELECTED", bestmove, bestscore)
    assert bestmove is not None
    return bestmove, True
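# A standalone sketch of the UCT formula used in select_next_move above, with made-up
# visit counts to show how the exploration bonus fades as a child is visited more; the
# helper name and the numbers are illustrative assumptions (math is assumed to be
# imported at module level, as the surrounding code requires).
def uct_score(wins, visits, total_visits, C=1.4):
    # average result (exploitation) plus a bonus that shrinks with more visits (exploration)
    return (wins / visits) + C * math.sqrt(2 * math.log(total_visits) / visits)

# e.g. with 100 total visits, a child with 4 wins in 5 visits outranks one with 30 in 50:
# uct_score(4, 5, 100) ≈ 2.7, uct_score(30, 50, 100) ≈ 1.2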
def negamax_alpha_beta_pruning(self, state, color, alpha=-math.inf, beta=math.inf, depth=10):
    '''
    implement negamax algorithm with alpha-beta pruning
    https://en.wikipedia.org/wiki/Negamax
    '''
    # negamax.counter += 1

    # CHECK LEAF NODE / NO NEED TO CHECK DEPTH == 0 BECAUSE TicTacToe is so small
    # LEAF NODE is checked at play time
    orig_alpha = alpha

    # Transposition table lookup
    # ob = OptimalBoard(state)
    # _id = ob.board_id
    _id = OptimalBoard.board_to_id(state)
    cache = self.tp.get(_id)
    if cache and cache['depth'] >= depth:
        (cached_score, cached_action) = cache['value']
        if cache['flag'] == self.tp.EXACT:
            return (cached_score, cached_action)
        elif cache['flag'] == self.tp.LOWERBOUND:
            alpha = max(alpha, cached_score)
        elif cache['flag'] == self.tp.UPPERBOUND:
            beta = min(beta, cached_score)
        if alpha >= beta:
            return cached_score, cached_action
    # else:
    #     print("MISS", t.seq)

    # RECURSIVE
    actions = SP.available_actions(state)
    random.shuffle(actions)  # move ordering improves alpha-beta pruning performance
    best_score = -math.inf
    best_move = -1
    for action in actions:
        next_s = state[:]
        score, done = SP.play(next_s, action, color)
        if not done:
            score, _ = self.negamax_alpha_beta_pruning(next_s,
                                                       SP.next(color),
                                                       alpha=-beta,
                                                       beta=-alpha,
                                                       depth=depth - 1)
            score = -score  # negamax
        # keep a single best move; random.shuffle plus the random tie-break add variety
        if best_score < score or (score == best_score and random.random() < 0.5):
            best_score = score
            best_move = action
        if alpha < score:
            alpha = score  # effectively alpha = max(alpha, best_score)
        if alpha > beta:
            break

    # store the result with the appropriate bound flag
    if best_score <= orig_alpha:
        flag = self.tp.UPPERBOUND
    elif best_score >= beta:
        flag = self.tp.LOWERBOUND
    else:
        flag = self.tp.EXACT
    self.tp.put(key=_id, depth=depth, value=(best_score, best_move), flag=flag)

    return (alpha, best_move)
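# A minimal, assumed sketch of a transposition table with bound flags that would match the
# get/put calls and the EXACT/LOWERBOUND/UPPERBOUND constants used by
# negamax_alpha_beta_pruning above; the project's real self.tp class is not shown here.
class FlaggedTranspositionTable:
    EXACT = 0
    LOWERBOUND = 1
    UPPERBOUND = 2

    def __init__(self):
        self._table = {}

    def get(self, key):
        # return a dict with 'depth', 'flag' and 'value', or None on a miss
        return self._table.get(key)

    def put(self, key, depth, value, flag):
        # keep the entry from the deepest search seen for this position
        cached = self._table.get(key)
        if cached is None or cached['depth'] <= depth:
            self._table[key] = {'depth': depth, 'flag': flag, 'value': value}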