def play_game(self, best_model, ng_model):
    env = ReversiEnv().reset()

    best_player = ReversiPlayer(self.config, best_model, play_config=self.config.eval.play_config)
    ng_player = ReversiPlayer(self.config, ng_model, play_config=self.config.eval.play_config)
    best_is_black = random() < 0.5
    if best_is_black:
        black, white = best_player, ng_player
    else:
        black, white = ng_player, best_player

    observation = env.observation
    while not env.done:
        if env.next_player == Player.black:
            action = black.action(observation.black, observation.white)
        else:
            action = white.action(observation.white, observation.black)
        observation, info = env.step(action)

    # ng_win: 1 if the next-generation model won, 0 if it lost, None for a draw
    ng_win = None
    if env.winner == Winner.black:
        if best_is_black:
            ng_win = 0
        else:
            ng_win = 1
    elif env.winner == Winner.white:
        if best_is_black:
            ng_win = 1
        else:
            ng_win = 0
    return ng_win, best_is_black, observation.number_of_black_and_white
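# A sketch of how play_game() might be driven by an evaluation loop. The loop,
# the game_num/threshold values, and the evaluate_model name are illustrative
# assumptions, not code from this repository.
def evaluate_model(self, best_model, ng_model, game_num=100, win_rate_threshold=0.55):
    results = []
    for _ in range(game_num):
        # assumes number_of_black_and_white is a (black, white) count pair
        ng_win, best_is_black, (black_cnt, white_cnt) = self.play_game(best_model, ng_model)
        if ng_win is not None:  # draws (ng_win is None) do not count toward the win rate
            results.append(ng_win)
    win_rate = sum(results) / len(results) if results else 0
    return win_rate >= win_rate_threshold  # True => promote the next-generation model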
def __init__(self, config: Config, model_dir):
    self.config = config
    self.human_color = None
    self.observers = []
    self.env = ReversiEnv().reset()
    self.model = self._load_model(model_dir)
    self.ai = None  # type: EvaluatePlayer
    self.ai_confidence = None
def __init__(self, config: Config):
    self.config = config
    self.human_color = None
    self.observers = []
    self.env = ReversiEnv().reset()
    self.model = self._load_model()
    self.ai = None  # type: ReversiPlayer
    self.last_evaluation = None
    self.last_history = None  # type: HistoryItem
async def start_search_my_move(self, own, enemy):
    self.running_simulation_num += 1
    root_key = self.counter_key(ReversiEnv().update(own, enemy, Player.black))
    with await self.sem:  # reduce parallel search number
        env = ReversiEnv().update(own, enemy, Player.black)
        leaf_v = await self.search_my_move(env, is_root_node=True)
        self.running_simulation_num -= 1
        if self.callback_in_mtcs and self.callback_in_mtcs.per_sim > 0 and \
                self.running_simulation_num % self.callback_in_mtcs.per_sim == 0:
            self.callback_in_mtcs.callback(list(self.var_q[root_key]), list(self.var_n[root_key]))
        return leaf_v
def start_game(self, human_is_black):
    self.human_color = Player.black if human_is_black else Player.white
    self.env = ReversiEnv().reset()

    def make_sim_env_fn():
        return self.env.copy()

    self.ai = EvaluatePlayer(make_sim_env_fn=make_sim_env_fn, config=self.config, model=self.model)
    self.ai.prepare(self.env, dir_noise=False)
    self.ai_confidence = None
def __init__(self, config: Config):
    self.config = config
    self.reader = NonBlockingStreamReader(sys.stdin)
    self.handler = NBoardProtocolVersion2(config, self)
    self.running = False
    self.nc = self.config.nboard  # shortcut
    self.env = ReversiEnv().reset()
    self.model = load_model(self.config)
    self.play_config = self.config.play
    self.player = self.create_player()
    self.turn_of_nboard = None
def action(self, own, enemy): """ :param own: BitBoard :param enemy: BitBoard :return: action: move pos=0 ~ 63 (0=top left, 7 top right, 63 bottom right) """ env = ReversiEnv().update(own, enemy, Player.black) key = self.counter_key(env) for tl in range(self.play_config.thinking_loop): if tl > 0 and self.play_config.logging_thinking: logger.debug( f"continue thinking: policy move=({action % 8}, {action // 8}), " f"value move=({action_by_value % 8}, {action_by_value // 8})" ) self.search_moves(own, enemy) policy = self.calc_policy(own, enemy) action = int(np.random.choice(range(64), p=policy)) action_by_value = int( np.argmax(self.var_q[key] + (self.var_n[key] > 0) * 100)) if action == action_by_value or env.turn < self.play_config.change_tau_turn: break self.moves.append([(own, enemy), list(policy)]) # this is for play_gui, not necessary when training. self.thinking_history[(own, enemy)] = HistoryItem(action, policy, list(self.var_q[key]), list(self.var_n[key])) return action
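# The docstring above packs a move as pos = y * 8 + x, row-major from the
# top-left. A tiny hypothetical helper (not part of the original code) for
# converting between the two representations:
def pos_to_xy(pos):
    """0 -> (0, 0) top-left, 7 -> (7, 0) top-right, 63 -> (7, 7) bottom-right."""
    return pos % 8, pos // 8

def xy_to_pos(x, y):
    return y * 8 + x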
async def start_search_my_move(self, own, enemy):
    self.running_simulation_num += 1
    with await self.sem:  # reduce parallel search number
        env = ReversiEnv().update(own, enemy, Player.black)
        leaf_v = await self.search_my_move(env, is_root_node=True)
        self.running_simulation_num -= 1
        return leaf_v
def action(self, own, enemy):
    """
    :param own: BitBoard
    :param enemy: BitBoard
    :return: action: move pos=0 ~ 63 (0=top left, 7=top right, 63=bottom right)
    """
    env = ReversiEnv().update(own, enemy, Player.black)
    key = self.counter_key(env)

    for tl in range(self.play_config.thinking_loop):
        if tl > 0 and self.play_config.logging_thinking:
            logger.debug(f"continue thinking: policy move=({action % 8}, {action // 8}), "
                         f"value move=({action_by_value % 8}, {action_by_value // 8})")
        self.search_moves(own, enemy)
        policy = self.calc_policy(own, enemy)
        action = int(np.random.choice(range(64), p=policy))
        action_by_value = int(np.argmax(self.var_q[key] + (self.var_n[key] > 0) * 100))
        if action == action_by_value or env.turn < self.play_config.change_tau_turn or env.turn <= 1:
            break

    # this is for play_gui, not necessary when training.
    next_key = self.get_next_key(own, enemy, action)
    self.thinking_history[(own, enemy)] = HistoryItem(action, policy, list(self.var_q[key]), list(self.var_n[key]),
                                                      list(self.var_q[next_key]), list(self.var_n[next_key]))

    if self.play_config.resign_threshold is not None and \
            np.max(self.var_q[key] - (self.var_n[key] == 0) * 10) <= self.play_config.resign_threshold:
        self.resigned = True
        if self.enable_resign:
            return None  # means resign

    saved_policy = self.calc_policy_by_tau_1(key) if self.config.play_data.save_policy_of_tau_1 else policy
    self.moves.append([(own, enemy), list(saved_policy)])
    return action
def action_with_evaluation(self, own, enemy, callback_in_mtcs=None):
    """
    :param own: BitBoard
    :param enemy: BitBoard
    :param CallbackInMCTS callback_in_mtcs:
    :rtype: ActionWithEvaluation
    :return: ActionWithEvaluation(
                 action=move pos=0 ~ 224 (0=top left, 14=top right, 224=bottom right),
                 n=N of the action,
                 q=W/N of the action,
             )
    """
    env = ReversiEnv().update(own, enemy, Player.black)
    key = self.counter_key(env)
    self.callback_in_mtcs = callback_in_mtcs
    pc = self.play_config

    if pc.use_solver_turn and env.turn >= pc.use_solver_turn:
        ret = self.action_by_searching(key)
        if ret:  # do not save the move as play data
            return ret

    for tl in range(self.play_config.thinking_loop):
        if env.turn > 0:
            self.search_moves(own, enemy)
        else:
            self.bypass_first_move(key)
        policy = self.calc_policy(own, enemy)
        action = int(np.random.choice(range(225), p=policy))
        action_by_value = int(np.argmax(self.var_q(key) + (self.var_n[key] > 0) * 100))
        value_diff = self.var_q(key)[action] - self.var_q(key)[action_by_value]
        if env.turn <= pc.start_rethinking_turn or self.requested_stop_thinking or \
                (value_diff > -0.01 and self.var_n[key][action] >= pc.required_visit_to_decide_action):
            break

    # this is for play_gui, not necessary when training.
    self.update_thinking_history(own, enemy, action, policy)

    if self.play_config.resign_threshold is not None and \
            np.max(self.var_q(key) - (self.var_n[key] == 0) * 10) <= self.play_config.resign_threshold:
        self.resigned = True
        if self.enable_resign:
            if env.turn >= self.config.play.allowed_resign_turn:
                return ActionWithEvaluation(None, 0, 0)  # means resign
            else:
                logger.debug(f"Want to resign but disallowed turn {env.turn} < {self.config.play.allowed_resign_turn}")

    saved_policy = self.calc_policy_by_tau_1(key) if self.config.play_data.save_policy_of_tau_1 else policy
    self.add_data_to_move_buffer_with_8_symmetries(own, enemy, saved_policy)
    return ActionWithEvaluation(action=action, n=self.var_n[key][action], q=self.var_q(key)[action])
def play_game(self, model_1, model_2):
    env = ReversiEnv().reset()

    def make_sim_env_fn():
        return env.copy()

    p1 = EvaluatePlayer(make_sim_env_fn=make_sim_env_fn, config=self.config, model=model_1,
                        play_config=self.config.eval.play_config)
    p1.prepare(env, dir_noise=False)
    p2 = EvaluatePlayer(make_sim_env_fn=make_sim_env_fn, config=self.config, model=model_2,
                        play_config=self.config.eval.play_config)
    p2.prepare(env, dir_noise=False)

    p1_is_black = random() < 0.5
    if p1_is_black:
        black, white = p1, p2
    else:
        black, white = p2, p1

    while not env.done:
        if env.next_player == Player.black:
            action, _, _ = black.think()
        else:
            action, _, _ = white.think()
        env.step(action)
        # keep both players' search trees in sync with the move actually played
        black.play(action, env)
        white.play(action, env)

    if env.black_wins:
        p1_win = p1_is_black
    elif env.black_loses:
        p1_win = not p1_is_black
    else:
        p1_win = None  # draw
    return p1_win
async def search_my_move(self, env: ReversiEnv, is_root_node=False): """ Q, V is value for this Player(always black). P is value for the player of next_player (black or white) :param env: :param is_root_node: :return: """ if env.done: if env.winner == Winner.black: return 1 elif env.winner == Winner.white: return -1 else: return 0 key = self.counter_key(env) another_side_key = self.another_side_counter_key(env) while key in self.now_expanding: await asyncio.sleep(self.config.play.wait_for_expanding_sleep_sec) # is leaf? if key not in self.expanded: # reach leaf node leaf_v = await self.expand_and_evaluate(env) if env.next_player == Player.black: return leaf_v # Value for black else: return -leaf_v # Value for white == -Value for black virtual_loss = self.config.play.virtual_loss virtual_loss_for_w = virtual_loss if env.next_player == Player.black else -virtual_loss action_t = self.select_action_q_and_u(env, is_root_node) _, _ = env.step(action_t) self.var_n[key][action_t] += virtual_loss self.var_w[key][action_t] -= virtual_loss_for_w leaf_v = await self.search_my_move(env) # next move # on returning search path # update: N, W self.var_n[key][action_t] += - virtual_loss + 1 self.var_w[key][action_t] += virtual_loss_for_w + leaf_v # update another side info(flip color and player) self.var_n[another_side_key][action_t] += 1 self.var_w[another_side_key][action_t] -= leaf_v # must flip the sign. return leaf_v
def solve(self, black, white, next_player, timeout=30, exactly=False):
    self.timeout = timeout
    self.start_time = time()
    # an "exactly" solve needs fresh results: scores cached by a non-exact
    # solve may have stopped early and cannot be reused
    if not self.last_is_exactly and exactly:
        self.cache = {}
    self.last_is_exactly = exactly

    try:
        # logger.debug("start resolving")
        move, score = self.find_winning_move_and_score(ReversiEnv().update(black, white, next_player),
                                                       exactly=exactly)
        if next_player == Player.white:
            score = -score
        # logger.debug(f"solve answer=({move},{score})({time()-self.start_time:.3f} seconds)")
        return move, score
    except Timeout:
        return None, None
async def search_my_move(self, env: ReversiEnv, is_root_node=False):
    """
    Q, V are values for this Player (always black).
    P is the value for the player of next_player (black or white).

    :param env:
    :param is_root_node:
    :return: leaf value from black's point of view
    """
    if env.done:
        if env.winner == Winner.black:
            return 1
        elif env.winner == Winner.white:
            return -1
        else:
            return 0

    key = self.counter_key(env)

    while key in self.now_expanding:
        await asyncio.sleep(self.config.play.wait_for_expanding_sleep_sec)

    # is leaf?
    if key not in self.expanded:  # reach leaf node
        leaf_v = await self.expand_and_evaluate(env)
        if env.next_player == Player.black:
            return leaf_v  # Value for black
        else:
            return -leaf_v  # Value for white == -Value for black

    action_t = self.select_action_q_and_u(env, is_root_node)
    _, _ = env.step(action_t)

    virtual_loss = self.config.play.virtual_loss
    self.var_n[key][action_t] += virtual_loss
    self.var_w[key][action_t] -= virtual_loss
    leaf_v = await self.search_my_move(env)  # next move

    # on returning search path
    # update: N, W, Q, U
    n = self.var_n[key][action_t] = self.var_n[key][action_t] - virtual_loss + 1
    w = self.var_w[key][action_t] = self.var_w[key][action_t] + virtual_loss + leaf_v
    self.var_q[key][action_t] = w / n
    return leaf_v
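# Both search_my_move() variants above call select_action_q_and_u(), which is
# not shown here. Below is a minimal sketch of the usual AlphaZero PUCT
# selection rule, assuming var_p holds the network prior and legal_moves is a
# 0/1 mask; these names and the c_puct value are assumptions, not the original
# implementation.
import numpy as np

def select_action_q_and_u(var_q, var_n, var_p, legal_moves, c_puct=1.5):
    """Pick the move maximizing Q(s, a) + U(s, a)."""
    sqrt_total = np.sqrt(max(1, var_n.sum()))      # sqrt(sum_b N(s, b))
    u = c_puct * var_p * sqrt_total / (1 + var_n)  # exploration bonus
    score = var_q + u
    score[legal_moves == 0] = -np.inf              # never pick illegal moves
    return int(np.argmax(score))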
def calc_policy(self, own, enemy):
    """calc π(a|s0)

    :param own:
    :param enemy:
    :return:
    """
    pc = self.play_config
    env = ReversiEnv().update(own, enemy, Player.black)
    key = self.counter_key(env)
    if env.turn < pc.change_tau_turn:
        return self.calc_policy_by_tau_1(key)
    else:
        action = np.argmax(self.var_n[key])  # tau = 0
        ret = np.zeros(64)
        ret[action] = 1
        return ret
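# calc_policy_by_tau_1() is referenced above but not shown. With temperature
# tau = 1 the AlphaZero policy target is simply the normalized visit counts,
# so a sketch consistent with the call site (assuming self.var_n[key] is the
# per-action visit-count vector) would be:
def calc_policy_by_tau_1(self, key):
    """pi(a|s) proportional to N(s, a) when tau == 1."""
    visits = self.var_n[key]
    return visits / np.sum(visits)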
def action(self, own, enemy):
    """
    :param own: BitBoard
    :param enemy: BitBoard
    :return: action: move pos=0 ~ 63 (0=top left, 7=top right, 63=bottom right)
    """
    self.search_moves(own, enemy)
    policy = self.calc_policy(own, enemy)
    self.moves.append([(own, enemy), list(policy)])
    action = int(np.random.choice(range(64), p=policy))

    # this is for play_gui, not necessary when training.
    env = ReversiEnv().update(own, enemy, Player.black)
    key = self.counter_key(env)
    self.thinking_history[(own, enemy)] = HistoryItem(action, policy, list(self.var_q[key]), list(self.var_n[key]))
    return action
def find_winning_move_and_score(self, env: ReversiEnv, exactly=True):
    if env.done:
        b, w = env.board.number_of_black_and_white
        return None, b - w
    if time() - self.start_time > self.timeout:
        logger.debug("timeout!")
        raise Timeout()

    turn = env.turn
    key = black, white, next_player = env.board.black, env.board.white, env.next_player
    if key in self.cache:
        return self.cache[key]

    if next_player == Player.black:
        legal_moves = find_correct_moves(black, white)
    else:
        legal_moves = find_correct_moves(white, black)
    action_list = [idx for idx in range(225) if legal_moves & (1 << idx)]
    score_list = np.zeros(len(action_list), dtype=int)
    for i, action in enumerate(action_list):
        # restore the board state by hand instead of calling env.update(),
        # then apply the candidate move and recurse
        env.board.black = black
        env.board.white = white
        env.next_player = next_player
        env.turn = turn
        env.done = False
        env.winner = None
        env.step(action)
        _, score = self.find_winning_move_and_score(env, exactly=exactly)
        score_list[i] = score
        if not exactly:
            # no need to find the best-scoring move, any winning one will do
            if next_player == Player.black and score > 0:
                break
            elif next_player == Player.white and score < 0:
                break
    # print(list(zip(action_list, score_list)))
    if next_player == Player.black:
        best_action = action_list[int(np.argmax(score_list))]
        best_score = np.max(score_list)
    else:
        best_action = action_list[int(np.argmin(score_list))]
        best_score = np.min(score_list)
    self.cache[key] = (best_action, best_score)
    return best_action, best_score
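# A usage sketch of the solver above. The Solver class name and the bitboard
# variables are assumptions for illustration only.
solver = Solver()
# black/white are bitboards; ask for any winning move within 30 seconds
move, score = solver.solve(black, white, Player.black, timeout=30, exactly=False)
if move is None:
    print("solver timed out")
else:
    print(f"move={move}, final disc difference (black - white)={score}")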
def start(config: Config):
    tf_util.set_session_config(per_process_gpu_memory_fraction=0.3)
    api_server = MultiProcessReversiModelAPIServer(config)
    process_num = config.play_data.multi_process_num
    api_server.start_serve()

    with Manager() as manager:
        shared_var = SharedVar(manager, game_idx=read_as_int(config.resource.self_play_game_idx_file) or 0)
        with ProcessPoolExecutor(max_workers=process_num) as executor:
            futures = []
            for i in range(process_num):
                play_worker = SelfPlayWorker(config, env=ReversiEnv(), api=api_server.get_api_client(),
                                             shared_var=shared_var, worker_index=i)
                futures.append(executor.submit(play_worker.start))
            # the executor's context manager blocks here until all workers finish
def start(config: Config):
    tf_util.set_session_config(allow_growth=True)
    # api_server_list = [MultiProcessReversiModelAPIServer(config) for i in range(config.model.num_gpus)]
    api_server = MultiProcessReversiModelAPIServer(config)
    api_server.start_serve()
    process_num = config.play_data.multi_process_num
    # for i in range(config.model.num_gpus):
    #     api_server_list[i].start_serve(i)
    #     print(f'Create server on GPU#{i}')

    with Manager() as manager:
        shared_var = SharedVar(manager, game_idx=read_as_int(config.resource.self_play_game_idx_file) or 0)
        with ProcessPoolExecutor(max_workers=process_num) as executor:
            futures = []
            for i in range(process_num):
                play_worker = SelfPlayWorker(config, env=ReversiEnv(), api=api_server.get_api_client(),
                                             shared_var=shared_var, worker_index=i)
                futures.append(executor.submit(play_worker.start))
def generate_train_data(self, batch_size):
    env = ReversiEnv()
    # The AZ paper doesn't leverage the symmetric-observation data augmentation,
    # but it is nice to use it if we can.
    symmetric_n = env.rotate_flip_op_count
    while True:
        orig_data_size = self.dataset.size
        data_size = orig_data_size * symmetric_n if symmetric_n > 1 else orig_data_size
        x, lm, y1, y2 = [], [], [], []
        for _ in range(batch_size):
            n = randint(0, data_size - 1)
            orig_n = n // symmetric_n if symmetric_n > 1 else n
            file_name, offset = self.dataset.locate(orig_n)
            state, policy, legal_moves, z = self.loaded_data[file_name][offset]
            state = env.decompress_ob(state)
            if symmetric_n > 1:
                # a sample index encodes both the original position and which
                # of the symmetries to apply
                op = n % symmetric_n
                state = env.rotate_flip_ob(state, op)
                policy = env.rotate_flip_pi(policy, op)
                legal_moves = env.rotate_flip_pi(legal_moves, op)
            state = np.transpose(state, [1, 2, 0])  # channels-last for the network input
            x.append(state)
            lm.append(legal_moves)
            y1.append(policy)
            y2.append([z])
        x = np.asarray(x)
        lm = np.asarray(lm)
        y1 = np.asarray(y1)
        y2 = np.asarray(y2)
        yield x, lm, y1, y2
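# rotate_flip_ob()/rotate_flip_pi() are not shown here. The eight symmetric
# variants of a square board form the dihedral group D4, which np.rot90 plus
# one flip can generate. A minimal sketch of that idea; the function name is
# mine, not the original API.
import numpy as np

def rotate_flip(plane, op):
    """Apply one of the 8 board symmetries, op in 0..7.

    op % 4 selects the quarter-turn rotation; op >= 4 adds a horizontal flip.
    """
    out = np.rot90(plane, k=op % 4)
    if op >= 4:
        out = np.fliplr(out)
    return out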
class PlayWithHuman:
    def __init__(self, config: Config):
        self.config = config
        self.human_color = None
        self.observers = []
        self.env = ReversiEnv().reset()
        self.model = self._load_model()
        self.ai = None  # type: ReversiPlayer
        self.last_evaluation = None
        self.last_history = None  # type: HistoryItem

    def add_observer(self, observer_func):
        self.observers.append(observer_func)

    def notify_all(self, event):
        for ob_func in self.observers:
            ob_func(event)

    def start_game(self, human_is_black):
        self.human_color = Player.black if human_is_black else Player.white
        self.env = ReversiEnv().reset()
        self.ai = ReversiPlayer(self.config, self.model)

    def play_next_turn(self):
        self.notify_all(GameEvent.update)
        if self.over:
            self.notify_all(GameEvent.over)
            return
        if self.next_player != self.human_color:
            self.notify_all(GameEvent.ai_move)

    @property
    def over(self):
        return self.env.done

    @property
    def next_player(self):
        return self.env.next_player

    def stone(self, px, py):
        """left top=(0, 0), right bottom=(7, 7)"""
        pos = int(py * 8 + px)
        assert 0 <= pos < 64
        bit = 1 << pos
        if self.env.board.black & bit:
            return Player.black
        elif self.env.board.white & bit:
            return Player.white
        return None

    @property
    def number_of_black_and_white(self):
        return self.env.observation.number_of_black_and_white

    def available(self, px, py):
        pos = int(py * 8 + px)
        if pos < 0 or 64 <= pos:
            return False
        own, enemy = self.env.board.black, self.env.board.white
        if self.human_color == Player.white:
            own, enemy = enemy, own
        legal_moves = find_correct_moves(own, enemy)
        return legal_moves & (1 << pos)

    def move(self, px, py):
        pos = int(py * 8 + px)
        assert 0 <= pos < 64
        if self.next_player != self.human_color:
            return False
        self.env.step(pos)

    def _load_model(self):
        from reversi_zero.agent.model import ReversiModel
        model = ReversiModel(self.config)
        if self.config.play.use_newest_next_generation_model:
            loaded = reload_newest_next_generation_model_if_changed(model) or load_best_model_weight(model)
        else:
            loaded = load_best_model_weight(model) or reload_newest_next_generation_model_if_changed(model)
        if not loaded:
            raise RuntimeError("No models found!")
        return model

    def move_by_ai(self):
        if self.next_player == self.human_color:
            return False
        own, enemy = self.get_state_of_next_player()
        action = self.ai.action(own, enemy)
        self.env.step(action)
        self.last_history = self.ai.ask_thought_about(own, enemy)
        self.last_evaluation = self.last_history.values[self.last_history.action]
        logger.debug(f"evaluation by ai={self.last_evaluation}")

    def get_state_of_next_player(self):
        if self.next_player == Player.black:
            own, enemy = self.env.board.black, self.env.board.white
        else:
            own, enemy = self.env.board.white, self.env.board.black
        return own, enemy
def start(config: Config):
    tf_util.set_session_config(per_process_gpu_memory_fraction=None, allow_growth=True)
    return SelfPlayWorker(config, env=ReversiEnv()).start()
def get_next_key(self, own, enemy, action):
    env = ReversiEnv().update(own, enemy, Player.black)
    env.step(action)
    return self.counter_key(env)
def start(config: Config, gpu_mem_frac=None):
    if gpu_mem_frac is not None:
        config.model.gpu_mem_frac = gpu_mem_frac
    return SelfPlayWorker(config, env=ReversiEnv()).start()
class NBoardEngine:
    def __init__(self, config: Config):
        self.config = config
        self.reader = NonBlockingStreamReader(sys.stdin)
        self.handler = NBoardProtocolVersion2(config, self)
        self.running = False
        self.nc = self.config.nboard  # shortcut
        self.env = ReversiEnv().reset()
        self.model = load_model(self.config)
        self.play_config = self.config.play
        self.player = self.create_player()
        self.turn_of_nboard = None

    def create_player(self):
        logger.debug("create new ReversiPlayer()")
        return ReversiPlayer(self.config, self.model, self.play_config, enable_resign=False)

    def start(self):
        self.running = True
        self.reader.start(push_callback=self.push_callback)
        while self.running:
            message = self.reader.readline(self.nc.read_stdin_timeout)
            if message is None:
                continue
            message = message.strip()
            logger.debug(f"> {message}")
            self.handler.handle_message(message)

    def push_callback(self, message: str):
        # note: called in another thread
        if message.startswith("ping"):  # interrupt
            self.stop_thinking()

    def stop(self):
        self.running = False

    def reply(self, message):
        logger.debug(f"< {message}")
        sys.stdout.write(message + "\n")
        sys.stdout.flush()

    def stop_thinking(self):
        self.player.stop_thinking()

    def set_depth(self, n):
        try:
            n = int(n)
            self.play_config.simulation_num_per_move = n * self.nc.simulation_num_per_depth_about
            logger.info(f"set simulation_num_per_move to {self.play_config.simulation_num_per_move}")
        except ValueError:
            pass

    def reset_state(self):
        self.player = self.create_player()

    def set_game(self, game_state: GameState):
        self.env.reset()
        self.env.update(game_state.black, game_state.white, game_state.player)
        self.turn_of_nboard = game_state.player
        for action in game_state.actions:
            self._change_turn()
            if action is not None:
                self.env.step(action)

    def _change_turn(self):
        if self.turn_of_nboard:
            self.turn_of_nboard = Player.black if self.turn_of_nboard == Player.white else Player.white

    def move(self, action):
        self._change_turn()
        if action is not None:
            self.env.step(action)

    def go(self) -> GoResponse:
        if self.env.next_player != self.turn_of_nboard:
            return GoResponse(None, 0, 0)

        board = self.env.board
        if self.env.next_player == Player.black:
            states = (board.black, board.white)
        else:
            states = (board.white, board.black)

        start_time = time()
        action = self.player.action(*states)
        item = self.player.ask_thought_about(*states)
        evaluation = item.values[action]
        time_took = time() - start_time
        return GoResponse(action, evaluation, time_took)

    def hint(self, n_hint):
        """
        :param n_hint:
        """
        board = self.env.board
        if self.env.next_player == Player.black:
            states = (board.black, board.white)
        else:
            states = (board.white, board.black)

        def hint_report_callback(values, visits):
            hint_list = []
            for action, visit in list(sorted(enumerate(visits), key=lambda x: -x[1]))[:n_hint]:
                if visit > 0:
                    hint_list.append(HintResponse(action, values[action], visit))
            self.handler.report_hint(hint_list)

        callback_info = CallbackInMCTS(self.config.nboard.hint_callback_per_sim, hint_report_callback)
        self.player.action(*states, callback_in_mtcs=callback_info)
        item = self.player.ask_thought_about(*states)
        hint_report_callback(item.values, item.visit)
class PlayWithHuman:
    def __init__(self, config: Config):
        self.config = config
        self.human_color = None
        self.observers = []
        self.env = ReversiEnv().reset()
        self.model = self._load_model()
        self.ai = None  # type: ReversiPlayer
        self.last_evaluation = None
        self.last_history = None  # type: HistoryItem

    def add_observer(self, observer_func):
        self.observers.append(observer_func)

    def notify_all(self, event):
        for ob_func in self.observers:
            ob_func(event)

    def start_game(self, human_is_black):
        self.human_color = Player.black if human_is_black else Player.white
        self.env = ReversiEnv().reset()
        self.ai = ReversiPlayer(self.config, self.model)

    def play_next_turn(self):
        self.notify_all(GameEvent.update)
        if self.over:
            self.notify_all(GameEvent.over)
            return
        if self.next_player != self.human_color:
            self.notify_all(GameEvent.ai_move)

    @property
    def over(self):
        return self.env.done

    @property
    def next_player(self):
        return self.env.next_player

    def stone(self, px, py):
        """left top=(0, 0), right bottom=(14, 14)"""
        pos = int(py * 15 + px)
        assert 0 <= pos < 225
        bit = 1 << pos
        if self.env.board.black & bit:
            return Player.black
        elif self.env.board.white & bit:
            return Player.white
        return None

    @property
    def number_of_black_and_white(self):
        return self.env.observation.number_of_black_and_white

    def available(self, px, py):
        pos = int(py * 15 + px)
        if pos < 0 or 225 <= pos:
            return False
        own, enemy = self.env.board.black, self.env.board.white
        if self.human_color == Player.white:
            own, enemy = enemy, own
        legal_moves = find_correct_moves(own, enemy)
        return legal_moves & (1 << pos)

    def move(self, px, py):
        pos = int(py * 15 + px)
        assert 0 <= pos < 225
        if self.next_player != self.human_color:
            return False
        self.env.step(pos)

    def _load_model(self):
        return load_model(self.config)

    def move_by_ai(self):
        if self.next_player == self.human_color:
            return False
        own, enemy = self.get_state_of_next_player()
        action = self.ai.action(own, enemy)
        self.env.step(action)
        self.last_history = self.ai.ask_thought_about(own, enemy)
        self.last_evaluation = self.last_history.values[self.last_history.action]
        logger.debug(f"evaluation by ai={self.last_evaluation}")

    def get_state_of_next_player(self):
        if self.next_player == Player.black:
            own, enemy = self.env.board.black, self.env.board.white
        else:
            own, enemy = self.env.board.white, self.env.board.black
        return own, enemy
def start_game(self, human_is_black):
    self.human_color = Player.black if human_is_black else Player.white
    self.env = ReversiEnv().reset()
    self.ai = ReversiPlayer(self.config, self.model)
class PlayWithHuman:
    def __init__(self, config: Config, model_dir):
        self.config = config
        self.human_color = None
        self.observers = []
        self.env = ReversiEnv().reset()
        self.model = self._load_model(model_dir)
        self.ai = None  # type: EvaluatePlayer
        self.ai_confidence = None

    def add_observer(self, observer_func):
        self.observers.append(observer_func)

    def notify_all(self, event):
        for ob_func in self.observers:
            ob_func(event)

    def start_game(self, human_is_black):
        self.human_color = Player.black if human_is_black else Player.white
        self.env = ReversiEnv().reset()

        def make_sim_env_fn():
            return self.env.copy()

        self.ai = EvaluatePlayer(make_sim_env_fn=make_sim_env_fn, config=self.config, model=self.model)
        self.ai.prepare(self.env, dir_noise=False)
        self.ai_confidence = None

    def play_next_turn(self):
        self.notify_all(GameEvent.update)
        if self.over:
            self.notify_all(GameEvent.over)
            return
        if self.next_player != self.human_color:
            self.notify_all(GameEvent.ai_move)
        elif np.amax(self.env.legal_moves) == 0:
            # the human has no legal move: play the pass move (pos=64)
            print('pass move')
            pos = 64
            self.env.step(pos)
            self.ai.play(pos, self.env)

    @property
    def over(self):
        return self.env.done

    @property
    def next_player(self):
        return self.env.next_player

    def stone(self, px, py):
        """left top=(0, 0), right bottom=(7, 7)"""
        pos = int(py * 8 + px)
        assert 0 <= pos < 64
        bit = 1 << pos
        if self.env.board.black & bit:
            return Player.black
        elif self.env.board.white & bit:
            return Player.white
        return None

    @property
    def number_of_black_and_white(self):
        return self.env.board.number_of_black_and_white

    def available(self, px, py):
        pos = int(py * 8 + px)
        if pos < 0 or 64 <= pos:
            return False
        own, enemy = self.env.board.black, self.env.board.white
        if self.human_color == Player.white:
            own, enemy = enemy, own
        legal_moves = find_correct_moves(own, enemy)
        return legal_moves & (1 << pos)

    def move(self, px, py):
        pos = int(py * 8 + px)
        assert 0 <= pos < 64
        if self.next_player != self.human_color:
            raise Exception("not human's turn!")
        self.env.step(pos)
        self.ai.play(pos, self.env)

    def _load_model(self, model_dir):
        from reversi_zero.agent.model import ReversiModel
        model = ReversiModel(self.config)
        model.create_session()
        model.load(model_dir)
        return model

    def move_by_ai(self):
        if self.next_player == self.human_color:
            raise Exception("not AI's turn!")
        logger.info('start thinking...')
        action, _, vs = self.ai.think()
        self.ai_confidence = vs
        logger.info('end thinking...')
        self.env.step(action)
        self.ai.play(action, self.env)

    def get_state_of_next_player(self):
        if self.next_player == Player.black:
            own, enemy = self.env.board.black, self.env.board.white
        else:
            own, enemy = self.env.board.white, self.env.board.black
        return own, enemy
def start(config: Config):
    tf_util.set_session_config(per_process_gpu_memory_fraction=0.3)
    return SelfPlayWorker(config, env=ReversiEnv()).start()