def _calc_policy(self, own, enemy):
    env = OthelloEnv().update(own, enemy, Stone.black)
    node = create_node(env)
    if env.epoch < self.play_config.change_tau_turn:
        # early turns: return a policy proportional to visit counts (tau = 1)
        return self.__calc_policy_by_prob(node)
    else:
        # later turns: return a one-hot policy on the best move (tau -> 0)
        return self.__calc_policy_by_max(node)
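# --- Illustrative sketch (not the project's code) ---------------------------
# __calc_policy_by_prob and __calc_policy_by_max are not shown in this excerpt.
# Assuming the standard AlphaZero temperature scheme, and that num_tree[node]
# holds the 64 per-square visit counts, they could look roughly like this:
import numpy as np

def calc_policy_by_prob_sketch(visit_counts: np.ndarray) -> np.ndarray:
    """Hypothetical: policy proportional to visit counts (temperature tau = 1)."""
    total = visit_counts.sum()
    return visit_counts / total if total > 0 else np.full(64, 1 / 64)

def calc_policy_by_max_sketch(visit_counts: np.ndarray) -> np.ndarray:
    """Hypothetical: one-hot policy on the most visited square (tau -> 0)."""
    policy = np.zeros(64)
    policy[int(np.argmax(visit_counts))] = 1.0
    return policy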
async def __start_search_my_move(self, own, enemy):
    # bookkeeping for the number of simulations in flight
    self.running_simulation_num += 1
    # acquire the semaphore, which bounds the number of concurrent search coroutines (e.g. 8)
    async with self.sem:
        env = OthelloEnv().update(own, enemy, Stone.black)
        leaf_v = await self.___recursive_simulation(env, is_root_node=True)
        self.running_simulation_num -= 1
        return leaf_v
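# --- Illustrative sketch (not the project's code) ---------------------------
# How a semaphore-bounded batch of such search coroutines is typically driven.
# The class and numbers below are assumptions for illustration only; they
# merely mirror the self.sem / self.running_simulation_num bookkeeping above.
import asyncio

class SearchDriverSketch:
    def __init__(self, simulation_num_per_move: int = 100, parallel: int = 8):
        self.sem = asyncio.Semaphore(parallel)   # at most `parallel` searches at once
        self.simulation_num_per_move = simulation_num_per_move
        self.running_simulation_num = 0

    async def _one_simulation(self) -> float:
        self.running_simulation_num += 1
        async with self.sem:
            await asyncio.sleep(0)               # stand-in for the real tree search
            leaf_v = 0.0
        self.running_simulation_num -= 1
        return leaf_v

    def search(self):
        async def _run_all():
            coros = [self._one_simulation() for _ in range(self.simulation_num_per_move)]
            return await asyncio.gather(*coros)
        return asyncio.run(_run_all())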
def solve(self, black, white, next_to_play, exactly=False):
    self.start_time = time()
    # when switching from an inexact to an exact search, the previous cache
    # holds approximate scores and must be discarded
    if not self.last_is_exactly and exactly:
        self.cache = {}
    self.last_is_exactly = exactly
    try:
        move, score = self._find_winning_move_and_score(
            OthelloEnv().update(black, white, next_to_play), exactly=exactly)
        # internal scores are from black's perspective; return the score
        # from the perspective of the side to move
        return move, score if next_to_play == Stone.black else -score
    except Timeout:
        return None, None
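# --- Illustrative usage (assumed, not taken from the project) ----------------
# `solver` stands for an instance of the class defining solve() above and
# `env` for an OthelloEnv; both are placeholders wired up elsewhere.
def solve_or_fallback(solver, env):
    """Hypothetical wrapper: return the solver's move and score, or None to signal a fallback to MCTS."""
    move, score = solver.solve(env.chessboard.black, env.chessboard.white,
                               env.next_to_play, exactly=False)
    if move is None:         # (None, None) means the solver hit its timeout
        return None
    return move, score       # score is from the perspective of the side to move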
def think_and_play(self, own, enemy):
    """Pick a move for the current position.

    Strategy: before the solver turn (around move 50) search with the
    deep-learning MCTS; from that turn on, run the minimax solver first and,
    if it fails or times out, fall back to the previously built search tree.
    """
    # renew env
    self.start_time = time.time()
    env = OthelloEnv().update(own, enemy, next_to_play=Stone.black)
    node = create_node(env)

    # late game: minimax tree search; if it fails, the tree built so far is used
    if env.epoch >= self.play_config.use_solver_turn:
        logger.warning("Entering minmax_tree process")
        ret = self._solver(node)
        if ret:  # solver result is not saved as play data
            return ret
    else:
        # earlier turns: deep-learning MCTS
        for t1 in range(self.play_config.thinking_loop):
            logger.warning(f"Entering {t1} thinking_loop")
            self._expand_tree(env, node)
            policy, action, value_diff = self._calc_policy_and_action(node)
            # stop rethinking when the turn is still early, or when the chosen
            # action is clearly best and has been visited often enough
            if env.epoch <= self.play_config.start_rethinking_turn or \
                    (value_diff > -0.01 and
                     self.num_tree[node][action] >= self.play_config.required_visit_to_decide_action):
                break

    # record or return
    if self.mode == 'gui':
        self._update_thinking_history(own, enemy, action, policy)
        self._update_avalable(own, enemy, action, policy)
    elif self.mode == 'self_play':
        if self.allow_resign:
            # resign when even the best visited move has too low a win rate
            if self.play_config.resign_threshold is not None and \
                    np.max(self.win_rate(node) - (self.num_tree[node] == 0) * 10) <= self.play_config.resign_threshold:
                if env.epoch >= self.config.play.allowed_resign_turn:
                    return AcNQ(None, 0, 0)  # means resign
                else:
                    logger.debug(
                        f"Want to resign but disallowed turn {env.epoch} < {self.config.play.allowed_resign_turn}")
        # save the search policy as training data
        saved_policy = self.__calc_policy_by_prob(node) \
            if self.config.play_data.save_policy_of_tau_1 else policy
        self.__save_data_to_moves(own, enemy, saved_policy)
    return AcNQ(action=action,
                n=self.num_tree[node][action],
                q=self.win_rate(node)[action])
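# --- Illustrative example (not the project's code) ---------------------------
# The resign test above subtracts 10 from the win rate of never-visited moves
# so they cannot dominate np.max(); the numbers below are made up.
import numpy as np

_win_rate = np.array([0.05, 0.00, 0.12, 0.00])
_visits = np.array([30, 0, 45, 0])
_masked = _win_rate - (_visits == 0) * 10   # unvisited squares drop to about -10
print(np.max(_masked))                      # 0.12: best rate among moves actually visited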
def _find_winning_move_and_score(self, env: OthelloEnv, exactly=True):
    # terminal position: score is the final disc difference (black - white)
    if env.done:
        b, w = env.chessboard.black_white
        return None, b - w
    # cache key (also unpacked so the board can be restored inside the loop)
    key = black, white, next_to_play = (env.chessboard.black,
                                        env.chessboard.white,
                                        env.next_to_play)
    if key in self.cache:
        return self.cache[key]
    # timeout
    if time() - self.start_time > self.timeout:
        logger.debug("timeout!")
        raise Timeout()
    # recurse over every legal move
    legal_moves = find_correct_moves(
        *((black, white) if next_to_play == Stone.black else (white, black)))
    action_list = [idx for idx in range(64) if legal_moves & (1 << idx)]
    score_list = np.zeros(len(action_list), dtype=int)
    record_turn = env.epoch
    for i, action in enumerate(action_list):
        # restore the position before trying the next move
        env.chessboard.black = black
        env.chessboard.white = white
        env.next_to_play = next_to_play
        env.epoch = record_turn
        env.done = False
        env.result = None
        env.do(action)
        _, score = self._find_winning_move_and_score(env, exactly=exactly)
        score_list[i] = score
        if not exactly:
            # one winning line is enough when only the game result is needed
            if next_to_play == Stone.black and score > 0:
                break
            elif next_to_play == Stone.white and score < 0:
                break
    # black maximizes the score, white minimizes it
    best_action, best_score = (
        (action_list[int(np.argmax(score_list))], np.max(score_list))
        if next_to_play == Stone.black else
        (action_list[int(np.argmin(score_list))], np.min(score_list)))
    self.cache[key] = (best_action, best_score)
    return best_action, best_score
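# --- Illustrative example (not the project's code) ---------------------------
# With exactly=False the loop above breaks at the first winning line, so the
# tail of score_list stays 0; argmax/argmin still returns the proven result.
import numpy as np

_score_list = np.zeros(4, dtype=int)
_score_list[:2] = [-6, 8]                    # search stopped after finding +8 for black
_best = int(np.argmax(_score_list))
print(_best, _score_list[_best])             # -> 1 8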
class EnvGui:
    def __init__(self, config: Config):
        self.config = config
        self.env = OthelloEnv().reset()
        self.ai = OthelloPlayer(self.config, _load_model(self.config),
                                weight_table=WEIGHT_TABLE / 3, c=20, mc=True)  # type: OthelloPlayer
        self.human_stone = None
        self.rev_function = None
        self.count_one_step = 0
        self.count_all_step = 0
        self.last_evaluation = None
        self.last_history = None  # type: LastAcNQ
        self.last_ava = None
        self.action = None

    def start_game(self, human_is_black):
        # reset everything and assign the human's color
        self.__init__(self.config)
        self.human_stone = Stone.black if human_is_black else Stone.white

    def play_next_turn(self):
        # update the board, then finish the game or let the AI move
        self._do_move(1)
        if self.env.done:
            self._do_move(3)
            return
        if self.env.next_to_play != self.human_stone:
            self._do_move(2)

    def _do_move(self, event):
        self.rev_function[event]()

    def add_observer(self, ob_map):
        self.rev_function = ob_map

    def stone(self, px, py):
        """left top=(0, 0), right bottom=(7, 7); returns 2 for black, 1 for white, 0 for empty"""
        action = int(py * 8 + px)
        if self.env.chessboard.black & (1 << action):
            return 2
        elif self.env.chessboard.white & (1 << action):
            return 1
        else:
            return 0

    def available(self, px, py):
        own, enemy = (self.env.chessboard.black, self.env.chessboard.white) \
            if self.env.next_to_play == Stone.black else \
            (self.env.chessboard.white, self.env.chessboard.black)
        action = int(py * 8 + px)
        if action < 0 or 64 <= action \
                or (1 << action) & self.env.chessboard.black \
                or (1 << action) & self.env.chessboard.white \
                or not (1 << action) & find_correct_moves(own, enemy):
            return False
        return True

    def move(self, px, py):
        self.env.do(int(py * 8 + px))

    def move_by_ai(self):
        own, enemy = (self.env.chessboard.black, self.env.chessboard.white) \
            if self.env.next_to_play == Stone.black else \
            (self.env.chessboard.white, self.env.chessboard.black)
        start = time.time()
        self.action = self.ai.think_and_play(own, enemy).action
        end = time.time()
        self.count_one_step = end - start
        self.count_all_step += self.count_one_step
        self.env.do(self.action)
        # keep the AI's latest evaluation for display
        self.last_history = self.ai.thinking_history
        self.last_evaluation = self.last_history.values[self.last_history.action]
        self.last_ava = self.ai.avalable
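# --- Illustrative helper (not the project's code) ----------------------------
# The GUI maps a square (px, py) to bit index py * 8 + px. The reverse mapping
# turns any bitboard (e.g. the result of find_correct_moves) into squares:
def bitboard_to_squares(bb: int):
    """Bit i of a 64-bit board corresponds to square (px, py) = (i % 8, i // 8)."""
    return [(i % 8, i // 8) for i in range(64) if bb & (1 << i)]

# print(bitboard_to_squares((1 << 27) | (1 << 36)))  # -> [(3, 3), (4, 4)]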
def __get_next_key(self, own, enemy, action):
    env = OthelloEnv().update(own, enemy, Stone.black)
    env.do(action)
    return create_node(env)
async def ___recursive_simulation(self, env: OthelloEnv, is_root_node=False):
    """One MCTS simulation: walk down the tree, expand a leaf, and back up the value."""
    # keys for the current position seen from both sides
    node, another_side_node = create_both_nodes(env)
    if self.test_mode:
        if node not in map:  # `map` is assumed to be a module-level debug dict (it shadows the builtin)
            map[node] = env.epoch

    # return condition 1: game over
    if env.done:
        if env.result == Result.black:
            return 1
        elif env.result == Result.white:
            return -1
        else:
            return 0

    # return condition 2: endgame solver (after turn 50, minimax)
    if env.epoch >= self.config.play.use_solver_turn_in_simulation:
        action, point = self.solver.solve(node.black, node.white,
                                          Stone(node.next_to_play), exactly=False)
        if action is not None:  # the solver returns None on timeout
            point = point if env.next_to_play == Stone.black else -point
            leaf_v = np.sign(point)
            leaf_p = np.zeros(64)
            leaf_p[action] = 1
            # update the tree with the solver result
            update_num_tree_with_one_or_moresides(self.num_tree, node, action,
                                                  ["plus", "plus"], [1, 1])             # visit count +1
            update_win_tree_with_one_or_moresides(self.win_tree, node, action,
                                                  ["plus", "minus"], [leaf_v, leaf_v])  # win count +/- leaf value
            update_policy_tree_with_one_or_moresides(self.policy_tree, node,
                                                     ["set", "set"], [leaf_p, leaf_p])  # policy (position priors) for this node
            return np.sign(point)
    # global time budget safeguard
    if time.time() - self.start_time >= 55:
        return 0

    # return condition 3: expand the tree (at or before turn 50, use the neural network)
    while node in self.now_expanding:
        # another coroutine is expanding the same node; wait to avoid conflicts
        await asyncio.sleep(self.config.play.wait_for_expanding_sleep_sec)

    if node not in self.expanded:
        # reached a leaf node: evaluate it with the neural network
        leaf_v = await self.____expand_leaf_node(env)
        if env.next_to_play == Stone.black:
            return leaf_v   # value for black
        else:
            return -leaf_v  # value for white == -value for black
    else:
        # internal node: apply virtual loss, pick an action by UCB, and recurse
        virtual_loss_for_w = self.config.play.virtual_loss \
            if env.next_to_play == Stone.black else -self.config.play.virtual_loss
        action_t = self.____decide_action(env, is_root_node)  # UCB formula
        update_num_tree_with_one_or_moresides(self.num_tree, node, action_t,
                                              ["plus"], [self.config.play.virtual_loss])
        update_win_tree_with_one_or_moresides(self.win_tree, node, action_t,
                                              ["minus"], [virtual_loss_for_w])
        env.do(action_t)
        leaf_v = await self.___recursive_simulation(env)  # next move
        # on the way back up: undo the virtual loss and record the real result
        update_num_tree_with_one_or_moresides(self.num_tree, node, action_t,
                                              ["plus", "plus"],
                                              [-self.config.play.virtual_loss + 1, 1])
        update_win_tree_with_one_or_moresides(self.win_tree, node, action_t,
                                              ["plus", "minus"],
                                              [virtual_loss_for_w + leaf_v, leaf_v])
        if self.test_mode:
            logger.warning(f"{map[node]} {leaf_v}")
        return leaf_v
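# --- Illustrative sketch (not the project's code) ----------------------------
# ____decide_action is not shown in this excerpt. Assuming a standard
# AlphaZero-style PUCT rule over the 64 squares, with priors from policy_tree,
# visit counts from num_tree and accumulated values from win_tree, it could
# look roughly like this:
import numpy as np

def decide_action_sketch(prior: np.ndarray, visits: np.ndarray,
                         wins: np.ndarray, legal_mask: np.ndarray,
                         c_puct: float = 1.5) -> int:
    """Hypothetical PUCT selection: argmax of mean value plus exploration bonus."""
    q = np.where(visits > 0, wins / np.maximum(visits, 1), 0.0)      # mean value per move
    u = c_puct * prior * np.sqrt(visits.sum() + 1) / (1 + visits)    # exploration term
    score = q + u
    score[~legal_mask.astype(bool)] = -np.inf                        # never pick illegal squares
    return int(np.argmax(score))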