async def search_my_move(self, env: GomokuEnv, is_root_node=False):
    """
    Q, V is value for this Player (always white).
    P is value for the next player to move (black or white).
    :param env:
    :param is_root_node:
    :return:
    """
    if env.done:
        if env.winner == Winner.white:
            return 1
        elif env.winner == Winner.black:
            return -1
        else:
            return 0

    key = self.counter_key(env)

    # wait while another coroutine is expanding this node
    while key in self.now_expanding:
        await asyncio.sleep(self.config.play.wait_for_expanding_sleep_sec)

    # is leaf?
    if key not in self.expanded:  # reached a leaf node
        leaf_v = await self.expand_and_evaluate(env)
        if env.player_turn() == Player.white:
            return leaf_v   # Value for white
        else:
            return -leaf_v  # Value for white == -Value for black

    action_t = self.select_action_q_and_u(env, is_root_node)
    _, _ = env.step(action_t)

    # apply a virtual loss so parallel searches are discouraged from taking the same path
    virtual_loss = self.config.play.virtual_loss
    self.var_n[key][action_t] += virtual_loss
    self.var_w[key][action_t] -= virtual_loss

    leaf_v = await self.search_my_move(env)  # next move

    # back-propagate the value up the search path: update N, W, Q
    if self.mem is not None:
        self.mem.update(key, action_t, leaf_v)
    n = self.var_n[key][action_t] = self.var_n[key][action_t] - virtual_loss + 1
    w = self.var_w[key][action_t] = self.var_w[key][action_t] + virtual_loss + leaf_v
    q = w / n
    if self.mem is not None:
        # blend the plain Q estimate with the AMAF (RAVE) estimate
        q = (1.0 - self.beta) * w / n + self.beta * self.mem.get_amaf_q(key, action_t)
    self.var_q[key][action_t] = q
    return leaf_v
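# Illustrative sketch (not part of the class above): the same virtual-loss
# bookkeeping on a single toy node with 3 actions, using plain numpy arrays.
# `n`, `w`, `q` mirror var_n/var_w/var_q; virtual_loss=3 and leaf_v=0.5 are
# made-up values for the walkthrough.
import numpy as np

n = np.zeros(3)
w = np.zeros(3)
virtual_loss = 3
a = 1  # action chosen (stands in for select_action_q_and_u)

# before descending: make this action look worse to other parallel searches
n[a] += virtual_loss
w[a] -= virtual_loss

leaf_v = 0.5  # value returned by the recursive search

# on the way back up: undo the virtual loss and record one real visit
n[a] = n[a] - virtual_loss + 1
w[a] = w[a] + virtual_loss + leaf_v
q = w[a] / n[a]
print(n[a], w[a], q)  # 1.0 0.5 0.5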
async def start_search_my_move(self, board, turn):
    self.running_simulation_num += 1
    async with self.sem:  # limit the number of parallel searches
        env = GomokuEnv().update(board, turn)
        leaf_v = await self.search_my_move(env, is_root_node=True)
        self.running_simulation_num -= 1
        return leaf_v
def action(self, board, turn):
    env = GomokuEnv().update(board, turn)
    key = self.counter_key(env)

    for tl in range(self.play_config.thinking_loop):
        if tl > 0 and self.play_config.logging_thinking:
            logger.debug(
                f"continue thinking: policy move=({action % 8}, {action // 8}), "
                f"value move=({action_by_value % 8}, {action_by_value // 8})")
        self.search_moves(board, turn)
        policy = self.calc_policy(board, turn)
        action = int(np.random.choice(range(self.labels_n), p=policy))
        # best move by Q value, restricted to actions that have actually been visited
        action_by_value = int(np.argmax(self.var_q[key] + (self.var_n[key] > 0) * 100))
        if action == action_by_value or env.turn < self.play_config.change_tau_turn:
            break

    # used by play_gui; not necessary when training
    self.thinking_history[env.observation] = HistoryItem(
        action, policy, list(self.var_q[key]), list(self.var_n[key]))
    self.moves.append([env.observation, list(policy)])
    return action
def start(config: Config):
    PlayWithHumanConfig().update_play_config(config.play)
    gomoku_model = PlayWithHuman(config)

    while True:
        env = GomokuEnv().reset()
        human_is_black = random() < 0.5
        gomoku_model.start_game(human_is_black)

        while not env.done:
            if env.player_turn() == Player.black:
                if not human_is_black:
                    action = gomoku_model.move_by_ai(env)
                    print("AI moves to: " + str(action))
                else:
                    action = gomoku_model.move_by_human(env)
                    print("You move to: " + str(action))
            else:
                if human_is_black:
                    action = gomoku_model.move_by_ai(env)
                    print("AI moves to: " + str(action))
                else:
                    action = gomoku_model.move_by_human(env)
                    print("You move to: " + str(action))
            env.step(action)
            env.render()

        print("\nEnd of the game.")
        print("Game result:")
        if env.winner == Winner.white:
            print("X wins")
        elif env.winner == Winner.black:
            print("O wins")
        else:
            print("Game was a draw")
def convert_to_training_data(data):
    """
    Helper function to convert saved self-play data to the training data format.
    :param data: format is SelfPlayWorker.buffer
    :return: (state_list, policy_list, z_list) as numpy arrays
    """
    state_list = []
    policy_list = []
    z_list = []
    for state, policy, z in data:
        board = list(state)
        board = np.reshape(board, (8, 5))
        env = GomokuEnv().update(board, 0)

        black_ary, white_ary = env.black_and_white_plane()
        # order the planes so the side to move comes first
        state = [black_ary, white_ary] if env.player_turn() == Player.black else [white_ary, black_ary]

        state_list.append(state)
        policy_list.append(policy)
        z_list.append(z)

    return np.array(state_list), np.array(policy_list), np.array(z_list)
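# Hypothetical usage sketch for convert_to_training_data(). The exact contents of
# SelfPlayWorker.buffer come from the self-play worker; here each entry is assumed
# to be (state, policy, z) with a flat 8x5 board, a policy vector over all 40 moves,
# and a game outcome in {-1, 0, 1}. The shapes in the comments assume
# black_and_white_plane() returns two 8x5 planes.
import numpy as np

fake_board = [0] * 40              # empty 8x5 board, flattened
fake_policy = np.full(40, 1 / 40)  # uniform policy over the 40 moves
fake_z = 1                         # assumed game outcome label
buffer = [(fake_board, fake_policy, fake_z)] * 4

states, policies, zs = convert_to_training_data(buffer)
# states: (4, 2, 8, 5)   policies: (4, 40)   zs: (4,)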
def calc_policy(self, board, turn):
    """Compute π(a|s0) from the visit counts.
    :return:
    """
    pc = self.play_config
    env = GomokuEnv().update(board, turn)
    key = self.counter_key(env)
    if env.turn < pc.change_tau_turn:
        # tau = 1: play proportionally to the visit counts
        return self.var_n[key] / np.sum(self.var_n[key])
    else:
        # tau = 0: play the most visited move deterministically
        action = np.argmax(self.var_n[key])
        ret = np.zeros(self.labels_n)
        ret[action] = 1
        return ret
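# Illustrative sketch of the two temperature regimes above, on made-up visit
# counts for a 5-action toy node (the numbers are arbitrary).
import numpy as np

visits = np.array([10, 30, 50, 5, 5], dtype=float)

pi_tau1 = visits / visits.sum()   # early game (tau = 1): sample proportionally
pi_tau0 = np.zeros_like(visits)   # later game (tau = 0): deterministic
pi_tau0[np.argmax(visits)] = 1

print(pi_tau1)  # [0.1  0.3  0.5  0.05 0.05]
print(pi_tau0)  # [0. 0. 1. 0. 0.]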
def play_game(self, best_model, ng_model):
    """Play a single game between the best model and the candidate (next-generation) model."""
    env = GomokuEnv().reset()

    best_player = GomokuPlayer(self.config, best_model, play_config=self.config.eval.play_config)
    ng_player = GomokuPlayer(self.config, ng_model, play_config=self.config.eval.play_config)
    best_is_white = random() < 0.5
    if not best_is_white:
        black, white = best_player, ng_player
    else:
        black, white = ng_player, best_player

    while not env.done:
        if env.player_turn() == Player.black:
            action = black.action(env.board, env.turn)
        else:
            action = white.action(env.board, env.turn)
        env.step(action)

    # record whether the candidate model won (None means a draw)
    ng_win = None
    if env.winner == Winner.white:
        ng_win = 0 if best_is_white else 1
    elif env.winner == Winner.black:
        ng_win = 1 if best_is_white else 0
    return ng_win, best_is_white
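# Hedged sketch (not from the source): one way an evaluator loop could aggregate
# play_game() results into a win rate for the candidate model. The game count and
# the 0.55 promotion threshold are assumptions, not values from this repository.
def evaluate_candidate(worker, best_model, ng_model, n_games=20, threshold=0.55):
    results = []
    for _ in range(n_games):
        ng_win, best_is_white = worker.play_game(best_model, ng_model)
        if ng_win is not None:       # draws are ignored here (assumption)
            results.append(ng_win)
    win_rate = sum(results) / len(results) if results else 0.0
    return win_rate >= threshold     # True -> promote the candidate model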
def start(config: Config):
    tf_util.set_session_config(per_process_gpu_memory_fraction=0.2)
    return SelfPlayWorker(config, env=GomokuEnv()).start()