async def start_search_my_move(self, board):
    self.running_simulation_num += 1
    async with self.sem:  # limit the number of parallel searches
        env = Connect4Env().update(board)
        leaf_v = await self.search_my_move(env, is_root_node=True)
        self.running_simulation_num -= 1
        return leaf_v
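# --- Hedged, standalone sketch (not taken from the repository) ---------------
# The coroutine above caps concurrency with self.sem; the snippet below shows
# the same pattern in isolation: an asyncio.Semaphore limits how many
# "simulations" run at once while asyncio.gather drives them all. The names
# fake_search, run_all and MAX_PARALLEL are illustrative only.
import asyncio

MAX_PARALLEL = 8

async def fake_search(sem: asyncio.Semaphore, i: int) -> int:
    async with sem:                  # same limiting idea as self.sem above
        await asyncio.sleep(0.01)    # stand-in for one tree-search simulation
        return i

async def run_all():
    sem = asyncio.Semaphore(MAX_PARALLEL)
    results = await asyncio.gather(*(fake_search(sem, i) for i in range(100)))
    print(len(results), "simulations finished")

if __name__ == "__main__":
    asyncio.run(run_all())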
def action(self, board):
    env = Connect4Env().update(board)
    key = self.counter_key(env)

    for tl in range(self.play_config.thinking_loop):
        if tl > 0 and self.play_config.logging_thinking:
            logger.debug(f"continue thinking: policy move={action}, "
                         f"value move={action_by_value}")
        self.search_moves(board)
        policy = self.calc_policy(board)
        action = int(np.random.choice(range(self.labels_n), p=policy))
        action_by_value = int(np.argmax(self.var_q[key] + (self.var_n[key] > 0) * 100))
        if action == action_by_value or env.turn < self.play_config.change_tau_turn:
            break

    # this is for play_gui, not necessary when training.
    self.thinking_history[env.observation] = HistoryItem(
        action, policy, list(self.var_q[key]), list(self.var_n[key]))
    self.moves.append([env.observation, list(policy)])
    return action
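# --- Hedged assumption --------------------------------------------------------
# thinking_history above stores a HistoryItem, which is not defined in this
# excerpt. A plausible definition is sketched below; the actual field names in
# the repository may differ.
from collections import namedtuple

HistoryItem = namedtuple("HistoryItem", ["action", "policy", "values", "visit"])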
def start(config: Config):
    PlayWithHumanConfig().update_play_config(config.play)
    connect4_model = PlayWithHuman(config)

    while True:
        env = Connect4Env().reset()
        human_is_black = random() < 0.5
        connect4_model.start_game(human_is_black)

        while not env.done:
            # The AI plays whichever colour the human did not take.
            if env.player_turn() == Player.black:
                ai_to_move = not human_is_black
            else:
                ai_to_move = human_is_black

            if ai_to_move:
                action = connect4_model.move_by_ai(env)
                print("AI moves to: " + str(action + 1))
            else:
                action = connect4_model.move_by_human(env)
                print("You move to: " + str(action + 1))
            env.step(action)
            env.render()

        print("\nEnd of the game.")
        print("Game result:")
        if env.winner == Winner.white:
            print("X wins")
        elif env.winner == Winner.black:
            print("O wins")
        else:
            print("Game was a draw")
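# --- Hedged assumption --------------------------------------------------------
# The loop above relies on Player and Winner enums defined elsewhere in the
# repository (alongside Connect4Env). A minimal sketch of what they might look
# like; the actual member values may differ.
import enum

class Player(enum.Enum):
    black = 1
    white = 2

class Winner(enum.Enum):
    black = 1
    white = 2
    draw = 3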
def play_game(self, best_model, ng_model):
    env = Connect4Env().reset()

    best_player = Connect4Player(self.config, best_model,
                                 play_config=self.config.eval.play_config)
    ng_player = Connect4Player(self.config, ng_model,
                               play_config=self.config.eval.play_config)
    best_is_white = random() < 0.5
    if not best_is_white:
        black, white = best_player, ng_player
    else:
        black, white = ng_player, best_player

    env.reset()
    while not env.done:
        if env.player_turn() == Player.black:
            action = black.action(env.board)
        else:
            action = white.action(env.board)
        env.step(action)

    # 1 if the challenger (next-generation model) won, 0 if it lost, None on a draw.
    ng_win = None
    if env.winner == Winner.white:
        ng_win = 0 if best_is_white else 1
    elif env.winner == Winner.black:
        ng_win = 1 if best_is_white else 0
    return ng_win, best_is_white
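# --- Hedged, illustrative helper (not from the repository) ---------------------
# One way the (ng_win, best_is_white) results of play_game could be aggregated
# into a replacement decision for the challenger model. The names evaluate,
# game_num and replace_rate are illustrative; draws (ng_win is None) are
# excluded from the win-rate tally here.
def evaluate(worker, best_model, ng_model, game_num=100, replace_rate=0.55):
    wins, decided = 0, 0
    for _ in range(game_num):
        ng_win, _best_is_white = worker.play_game(best_model, ng_model)
        if ng_win is not None:
            decided += 1
            wins += ng_win
    win_rate = wins / max(decided, 1)
    return win_rate >= replace_rate  # True -> promote the challenger model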
async def search_my_move(self, env: Connect4Env, is_root_node=False):
    """
    Q, V is the value for this player (always white).
    P is the value for the player to move next (black or white).

    :param env:
    :param is_root_node:
    :return:
    """
    if env.done:
        if env.winner == Winner.white:
            return 1
        elif env.winner == Winner.black:
            return -1
        else:
            return 0

    key = self.counter_key(env)

    while key in self.now_expanding:
        await asyncio.sleep(self.config.play.wait_for_expanding_sleep_sec)

    # is leaf?
    if key not in self.expanded:  # reached a leaf node
        leaf_v = await self.expand_and_evaluate(env)
        if env.player_turn() == Player.white:
            return leaf_v   # value for white
        else:
            return -leaf_v  # value for white == -value for black

    action_t = self.select_action_q_and_u(env, is_root_node)
    _, _ = env.step(action_t)

    virtual_loss = self.config.play.virtual_loss
    self.var_n[key][action_t] += virtual_loss
    self.var_w[key][action_t] -= virtual_loss

    leaf_v = await self.search_my_move(env)  # next move

    # on the way back up the search path, update N, W and Q
    n = self.var_n[key][action_t] = self.var_n[key][action_t] - virtual_loss + 1
    w = self.var_w[key][action_t] = self.var_w[key][action_t] + virtual_loss + leaf_v
    self.var_q[key][action_t] = w / n
    return leaf_v
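# --- Hedged, standalone sketch (not the repository's code) ---------------------
# select_action_q_and_u, called above, presumably scores each move with an
# AlphaZero-style PUCT rule; the function below computes such scores for one
# node from per-action value (q), visit-count (n) and prior (p) arrays.
# c_puct and the exact formula are assumptions.
import numpy as np

def puct_scores(q, n, p, c_puct=1.5):
    total_n = max(np.sum(n), 1)
    u = c_puct * p * np.sqrt(total_n) / (1 + n)  # exploration bonus
    return q + u                                  # argmax of this picks the move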
def calc_policy(self, board):
    """calc π(a|s0)
    :return:
    """
    pc = self.play_config
    env = Connect4Env().update(board)
    key = self.counter_key(env)
    if env.turn < pc.change_tau_turn:
        return self.var_n[key] / np.sum(self.var_n[key])  # tau = 1
    else:
        action = np.argmax(self.var_n[key])  # tau = 0
        ret = np.zeros(self.labels_n)
        ret[action] = 1
        return ret
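# --- Worked example (illustrative numbers only) --------------------------------
# How the two branches above behave for a toy visit-count vector.
import numpy as np

visits = np.array([10, 30, 5, 50, 0, 5, 0], dtype=float)
pi_tau1 = visits / visits.sum()          # early game (tau = 1): proportional to visits
pi_tau0 = np.zeros_like(visits)
pi_tau0[np.argmax(visits)] = 1.0         # late game (tau = 0): deterministic best move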
def convert_to_training_data(data):
    """
    :param data: format is SelfPlayWorker.buffer
    :return:
    """
    state_list = []
    policy_list = []
    z_list = []
    for state, policy, z in data:
        board = list(state)
        board = np.reshape(board, (6, 7))
        env = Connect4Env().update(board)

        black_ary, white_ary = env.black_and_white_plane()
        state = [black_ary, white_ary] \
            if env.player_turn() == Player.black else [white_ary, black_ary]

        state_list.append(state)
        policy_list.append(policy)
        z_list.append(z)

    return np.array(state_list), np.array(policy_list), np.array(z_list)
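# --- Hedged, standalone illustration (not the repository's encoding) -----------
# convert_to_training_data expects black_and_white_plane() to return one 6x7
# binary plane per colour. The sketch below shows such an encoding for a board
# whose cells are assumed to hold 1 (black), -1 (white) or 0 (empty); the
# repository's actual cell values may differ.
import numpy as np

def black_and_white_planes(board):
    board = np.asarray(board).reshape(6, 7)
    black_ary = (board == 1).astype(np.float32)
    white_ary = (board == -1).astype(np.float32)
    return black_ary, white_ary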