def play():
    env = OmokEnv(BOARD_SIZE, HISTORY)
    mcts = MCTS(BOARD_SIZE, HISTORY, N_SIMUL)
    result = {'Black': 0, 'White': 0, 'Draw': 0}
    for g in range(GAME):
        print('#' * (BOARD_SIZE - 4),
              ' GAME: {} '.format(g + 1),
              '#' * (BOARD_SIZE - 4))
        # reset state
        state, board = env.reset()
        done = False
        while not done:
            env.render()
            # start simulations
            action = mcts.get_action(state, board)
            state, board, z, done = env.step(action)
            if done:
                if z == 1:
                    result['Black'] += 1
                elif z == -1:
                    result['White'] += 1
                else:
                    result['Draw'] += 1
        # render final board & reset tree
        env.render()
        mcts.reset_tree()
        # print the running result
        blw, whw, drw = result['Black'], result['White'], result['Draw']
        print('')
        print('=' * 20, " {} Game End ".format(blw + whw + drw), '=' * 20)
        stats = (
            'Black Win: {} White Win: {} Draw: {} Winrate: {:.2f}%'.format(
                blw, whw, drw, (blw + 0.5 * drw) / (blw + whw + drw) * 100))
        print(stats, '\n')
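
# A minimal entry-point sketch. The constant values below are assumptions for
# illustration only; the repo defines its own BOARD_SIZE, HISTORY, N_SIMUL and
# GAME at module level, and OmokEnv / MCTS come from its env and MCTS modules.
if __name__ == '__main__':
    BOARD_SIZE = 9   # assumed: 9x9 board
    HISTORY = 2      # assumed: 2 past board planes per player
    N_SIMUL = 800    # assumed: MCTS simulations per move
    GAME = 10        # assumed: number of self-play games
    play()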
def main():
    env = OmokEnv(BOARD_SIZE, HISTORY)
    manager = HumanUI()
    result = {'Black': 0, 'White': 0, 'Draw': 0}
    for g in range(GAME):
        print('########## Game: {} ##########'.format(g + 1))
        state, board = env.reset()
        done = False
        idx = 0
        while not done:
            env.render()
            # start simulations
            action = manager.get_action(state, board, idx)
            state, board, z, done = env.step(action)
            idx += 1
            if done:
                if z == 1:
                    result['Black'] += 1
                elif z == -1:
                    result['White'] += 1
                else:
                    result['Draw'] += 1
        # render final board & reset tree
        env.render()
        manager.ai.reset_tree()
        # print the running result
        print('')
        print('=' * 20, " {} Game End ".format(g + 1), '=' * 20)
        blw, whw, drw = result['Black'], result['White'], result['Draw']
        stat = (
            'Black Win: {} White Win: {} Draw: {} Winrate: {:.2f}%'.format(
                blw, whw, drw, (blw + 0.5 * drw) / (blw + whw + drw) * 100))
        print(stat, '\n')
def play():
    USE_CUDA = torch.cuda.is_available()
    env = OmokEnv(BOARD_SIZE, HISTORY)
    manager = HumanUI()
    if USE_CUDA:
        manager.ai.model.cuda()
    # set model_path to a checkpoint file to load trained weights
    model_path = None
    if model_path:
        print('load model: {}\n'.format(model_path))
        manager.ai.model.load_state_dict(torch.load(model_path))
    result = {'Black': 0, 'White': 0, 'Draw': 0}
    for g in range(N_GAME):
        print('##### Game: {} #####'.format(g + 1))
        state, board = env.reset()
        done = False
        idx = 0
        while not done:
            env.render()
            # start simulations
            action = manager.get_action(state, board, idx)
            state, board, z, done = env.step(action)
            idx += 1
            if done:
                if z == 1:
                    result['Black'] += 1
                elif z == -1:
                    result['White'] += 1
                else:
                    result['Draw'] += 1
        # render final board & reset tree
        env.render()
        manager.ai.reset_tree()
        # print the running result
        blw, whw, drw = result['Black'], result['White'], result['Draw']
        print('')
        print('=' * 20, " {} Game End ".format(blw + whw + drw), '=' * 20)
        stats = (
            'Black Win: {} White Win: {} Draw: {} Winrate: {:.2f}%'.format(
                blw, whw, drw, (blw + 0.5 * drw) / (blw + whw + drw) * 100))
        print(stats, '\n')
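
# Hedged sketch: torch.load() on a GPU-saved checkpoint raises on CPU-only
# machines unless map_location is passed. load_checkpoint is a hypothetical
# helper, not part of this repo; it assumes the manager.ai.model layout used
# in play() above.
def load_checkpoint(manager, model_path):
    # map_location makes the load work with or without CUDA
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    state_dict = torch.load(model_path, map_location=device)
    manager.ai.model.load_state_dict(state_dict)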
class MCTS:
    def __init__(self, board_size, n_history, n_simul):
        self.env_simul = OmokEnv(board_size, n_history, display=False)
        self.board_size = board_size
        self.n_simul = n_simul
        self.tree = None
        self.root = None
        self.state = None
        self.board = None
        self.legal_move = None
        self.no_legal_move = None
        self.ucb = None
        # used for backup
        self.key_memory = None
        self.action_memory = None
        # init
        self._reset()
        self.reset_tree()

    def _reset(self):
        self.key_memory = deque(maxlen=self.board_size**2)
        self.action_memory = deque(maxlen=self.board_size**2)

    def reset_tree(self):
        # each edge stores [N, Q]: visit count and mean value
        self.tree = defaultdict(
            lambda: np.zeros((self.board_size**2, 2), 'float'))

    def get_action(self, state, board):
        self.root = state.copy()
        self._simulation(state)
        # init root board after simulations
        self.board = board
        board_fill = self.board[CURRENT] + self.board[OPPONENT]
        self.legal_move = np.argwhere(board_fill == 0).flatten()
        self.no_legal_move = np.argwhere(board_fill != 0).flatten()
        # root state's key
        root_key = hash(self.root.tobytes())
        # argmax Q (no exploration term at the root)
        action = self._selection(root_key, c_ucb=0)
        print('')
        print(self.ucb.reshape(
            self.board_size, self.board_size).round(decimals=4))
        return action

    def _simulation(self, state):
        start = time.time()
        finish = 0
        for sim in range(self.n_simul):
            print('\rsimulation: {}'.format(sim + 1), end='')
            sys.stdout.flush()
            # reset state
            self.state, self.board = self.env_simul.reset(state)
            done = False
            n_selection = 0
            n_expansion = 0
            while not done:
                board_fill = self.board[CURRENT] + self.board[OPPONENT]
                self.legal_move = np.argwhere(board_fill == 0).flatten()
                self.no_legal_move = np.argwhere(board_fill != 0).flatten()
                key = hash(self.state.tobytes())
                # search my tree
                if key in self.tree:
                    # selection
                    action = self._selection(key, c_ucb=1)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                    n_selection += 1
                elif n_expansion == 0:
                    # expansion
                    action = self._expansion(key)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                    n_expansion += 1
                else:
                    # rollout
                    action = np.random.choice(self.legal_move)
                self.state, self.board, reward, done = \
                    self.env_simul.step(action)
                if done:
                    # backup & reset memory
                    self._backup(reward, n_selection + n_expansion)
                    self._reset()
            finish = time.time() - start
            # if finish >= self.think_time:
            #     break
        print('\n{} simulations end ({:0.0f}s)'.format(sim + 1, finish))

    def _selection(self, key, c_ucb):
        edges = self.tree[key]
        # get ucb
        ucb = self._ucb(edges, c_ucb)
        self.ucb = ucb
        if self.board[COLOR][0] == WHITE:
            # black's choice
            action = np.argwhere(ucb == ucb.max()).flatten()
        else:
            # white's choice
            action = np.argwhere(ucb == ucb.min()).flatten()
        # break ties uniformly at random
        action = action[np.random.choice(len(action))]
        return action

    def _expansion(self, key):
        # only select once, then roll out
        action = self._selection(key, c_ucb=1)
        return action

    def _ucb(self, edges, c_ucb):
        total_N = 0
        ucb = np.zeros((self.board_size**2), 'float')
        for i in range(self.board_size**2):
            total_N += edges[i][N]
        # black's ucb
        if self.board[COLOR][0] == WHITE:
            for move in self.legal_move:
                if edges[move][N] != 0:
                    ucb[move] = edges[move][Q] + c_ucb * \
                        np.sqrt(2 * np.log(total_N) / edges[move][N])
                else:
                    ucb[move] = np.inf
            for move in self.no_legal_move:
                ucb[move] = -np.inf
        # white's ucb
        else:
            for move in self.legal_move:
                if edges[move][N] != 0:
                    ucb[move] = edges[move][Q] - c_ucb * \
                        np.sqrt(2 * np.log(total_N) / edges[move][N])
                else:
                    ucb[move] = -np.inf
            for move in self.no_legal_move:
                ucb[move] = np.inf
        return ucb

    def _backup(self, reward, steps):
        # steps = n_selection + n_expansion
        # update edges in my tree with an incremental mean
        for i in range(steps):
            edges = self.tree[self.key_memory[i]]
            action = self.action_memory[i]
            edges[action][N] += 1
            edges[action][Q] += (reward - edges[action][Q]) / edges[action][N]
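
# A small worked example of the UCB1 score computed in _ucb(), using the same
# [N, Q] edge layout (column 0 = visit count, column 1 = mean value).
# The numbers are made up for illustration.
import numpy as np

edges = np.array([[10., 0.6],   # move 0: often visited, decent value
                  [2.,  0.9],   # move 1: rarely visited, high value
                  [0.,  0.0]])  # move 2: unvisited -> UCB = inf, tried first
total_N = edges[:, 0].sum()     # 12 visits in total
ucb = np.where(edges[:, 0] > 0,
               edges[:, 1] + np.sqrt(2 * np.log(total_N)
                                     / np.maximum(edges[:, 0], 1)),
               np.inf)
print(ucb.round(3))  # [1.305 2.476   inf]: exploration favors move 1 over 0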
class MCTS:
    def __init__(self, board_size, n_history, n_simul):
        self.env_simul = OmokEnv(board_size, n_history, display=False)
        self.board_size = board_size
        self.n_simul = n_simul
        self.tree = None
        self.root = None
        self.state = None
        self.board = None
        # used for backup
        self.key_memory = deque()
        self.action_memory = deque()
        self.reset_tree()

    def reset_tree(self):
        # each edge stores [N, Q]: visit count and mean value
        self.tree = defaultdict(lambda: zeros((self.board_size**2, 2)))

    def get_action(self, state, board):
        self.root = state.copy()
        self._simulation(state)
        # init root board after simulations
        self.board = board
        # root state's key
        root_key = hash(self.root.tobytes())
        # argmax visit count at the root
        action = self._selection(root_key, c_pucb=0)
        return action

    def _simulation(self, state):
        start = time.time()
        finish = 0
        for sim in range(self.n_simul):
            print('\rsimulation: {}'.format(sim + 1), end='')
            sys.stdout.flush()
            # reset state
            self.state, self.board = self.env_simul.reset(state)
            done = False
            is_expansion = True
            while not done:
                key = hash(self.state.tobytes())
                # search my tree
                if key in self.tree:
                    # selection
                    action = self._selection(key, c_pucb=5)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                else:
                    # expansion: record only the first unseen node,
                    # then roll out randomly
                    legal_move, _ = self._get_legal_move(self.board)
                    action = random.choice(legal_move)
                    if is_expansion:
                        self.action_memory.appendleft(action)
                        self.key_memory.appendleft(key)
                        is_expansion = False
                self.state, self.board, reward, done = self.env_simul.step(
                    action)
                if done:
                    # backup & reset memory
                    self._backup(reward)
            finish = time.time() - start
            # if finish >= self.think_time:
            #     break
        print('\r{} simulations end ({:0.0f}s)'.format(sim + 1, finish))

    def _get_legal_move(self, board):
        board_fill = board[CURRENT] + board[OPPONENT]
        legal_move = argwhere(board_fill == 0).flatten()
        return legal_move, board_fill

    def _selection(self, key, c_pucb):
        edges = self.tree[key]
        pucb = self._get_pucb(edges, c_pucb)
        if c_pucb == 0:
            # root decision: pick the most-visited move
            visit = edges[:, N]
            print('\nvisit count')
            print(visit.reshape(self.board_size, self.board_size).round())
            action = argwhere(visit == visit.max()).flatten()
            action = action[random.choice(len(action))]
            return action
        if self.board[COLOR][0] == WHITE:
            # black's choice
            action = argwhere(pucb == pucb.max()).flatten()
        else:
            # white's choice
            action = argwhere(pucb == pucb.min()).flatten()
        action = action[random.choice(len(action))]
        return action

    def _get_pucb(self, edges, c_pucb):
        legal_move, no_legal_loc = self._get_legal_move(self.board)
        # uniform prior over the legal moves
        prior = 1 / len(legal_move)
        total_N = edges.sum(0)[N]
        # black's pucb
        if self.board[COLOR][0] == WHITE:
            no_legal_loc *= -99999999
            pucb = edges[:, Q] + \
                c_pucb * prior * sqrt(total_N) / \
                (edges[:, N] + 1) + no_legal_loc
        # white's pucb
        else:
            no_legal_loc *= 99999999
            pucb = edges[:, Q] - \
                c_pucb * prior * sqrt(total_N) / \
                (edges[:, N] + 1) + no_legal_loc
        return pucb

    def _backup(self, reward):
        # update edges in my tree with an incremental mean
        while self.action_memory:
            key = self.key_memory.popleft()
            action = self.action_memory.popleft()
            edges = self.tree[key]
            edges[action][N] += 1
            edges[action][Q] += (reward - edges[action][Q]) / edges[action][N]
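
# Note: the bare zeros / argwhere / sqrt / random names above are assumed to
# be numpy imports (e.g. "from numpy import zeros, argwhere, sqrt, random").
# A one-line worked example of the uniform-prior PUCT score from _get_pucb(),
# with made-up numbers:
from numpy import sqrt

c_pucb, prior, total_N = 5, 1 / 9, 20          # assumed: 9 legal moves left
q, n = 0.4, 3                                  # one edge's current stats
score = q + c_pucb * prior * sqrt(total_N) / (n + 1)
print(round(score, 3))  # 0.4 + 5 * (1/9) * sqrt(20) / 4 = 1.021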
class MCTS:
    def __init__(self, n_block, channel, board_size, n_history, n_simul,
                 mode):
        self.env_simul = OmokEnv(board_size, n_history, display=False)
        self.model = PVNet(n_block, n_history * 2 + 1, channel, board_size)
        self.board_size = board_size
        self.n_simul = n_simul
        self.mode = mode
        self.alpha = 10 / board_size**2
        self.tree = None
        self.root = None
        self.root_key = None
        self.state = None
        self.board = None
        # used for backup
        self.key_memory = deque()
        self.action_memory = deque()
        self.reset_tree()

    def reset_tree(self):
        # each edge stores [N, Q, P]: visit count, mean value, prior
        self.tree = defaultdict(lambda: np.zeros((self.board_size**2, 3)))

    def get_action(self, state, board, tau):
        self.root = state.copy()
        self.root_key = hash(self.root.tobytes())
        self._simulation(state)
        # init root board after simulations
        self.board = board
        # argmax visit count
        action, pi = self._selection(self.root_key, c_pucb=0)
        if tau == 1:
            # early moves: sample proportionally to the visit counts
            action = np.random.choice(self.board_size**2, p=pi)
        return action, pi

    def _simulation(self, state):
        start = time.time()
        finish = 0
        for sim in range(self.n_simul):
            print('\rsimulation: {}'.format(sim + 1), end='')
            sys.stdout.flush()
            # reset state
            self.state, self.board = self.env_simul.reset(state)
            done = False
            while not done:
                key = hash(self.state.tobytes())
                # search my tree
                if key in self.tree:
                    # selection
                    action = self._selection(key, c_pucb=5)
                    self.action_memory.appendleft(action)
                    self.key_memory.appendleft(key)
                else:
                    # expansion: evaluate the leaf with the network
                    reward, done = self._expansion(key, self.state)
                    break
                self.state, self.board, reward, done = self.env_simul.step(
                    action)
            if done:
                # backup & reset memory
                self._backup(reward)
            finish = time.time() - start
            # if finish >= self.think_time:
            #     break
        print('\r{} simulations end ({:0.0f}s)'.format(sim + 1, finish))

    def _selection(self, key, c_pucb):
        edges = self.tree[key]
        pucb = self._get_pucb(edges, key, c_pucb)
        if c_pucb == 0:
            # root decision: pi comes from the visit counts
            visit = edges[:, N]
            print('\nvisit count')
            print(visit.reshape(self.board_size, self.board_size).round())
            action = np.argwhere(visit == visit.max()).flatten()
            action = action[np.random.choice(len(action))]
            # pi = np.exp(visit) / np.exp(visit).sum()
            pi = visit / visit.sum()
            print('\npi')
            print(pi.reshape(
                self.board_size, self.board_size).round(decimals=2))
            return action, pi
        if self.board[COLOR][0] == WHITE:
            # black's choice
            action = np.argwhere(pucb == pucb.max()).flatten()
        else:
            # white's choice
            action = np.argwhere(pucb == pucb.min()).flatten()
        action = action[np.random.choice(len(action))]
        return action

    def _expansion(self, key, state):
        edges = self.tree[key]
        state_input = Variable(
            TENSOR([state.reshape(
                HISTORY * 2 + 1, self.board_size, self.board_size)]))
        prior, value = self.model(state_input)
        # normalize the policy output into a probability distribution
        prior = prior.exp() / prior.exp().sum()
        edges[:, P] = prior.data.cpu().numpy()[0]
        done = True
        return value.data.cpu().numpy()[0], done

    def _backup(self, reward):
        # update edges in my tree with an incremental mean
        while self.key_memory:
            key = self.key_memory.popleft()
            edges = self.tree[key]
            if self.action_memory:
                action = self.action_memory.popleft()
                edges[action][N] += 1
                edges[action][Q] += (reward - edges[action][Q]) / \
                    edges[action][N]

    def _get_no_legal_loc(self, board):
        board_fill = board[CURRENT] + board[OPPONENT]
        legal_action = np.argwhere(board_fill == 0).flatten()
        return board_fill, legal_action

    def _get_pucb(self, edges, key, c_pucb):
        no_legal_loc, legal_action = self._get_no_legal_loc(self.board)
        prob = edges[:, P]
        if key == self.root_key and self.mode == 'learn':
            # mix Dirichlet noise into the root prior for exploration
            # (prob is a view, so the noisy prior is written back to the tree)
            noise = np.random.dirichlet(
                self.alpha * np.ones(len(legal_action)))
            for i, action in enumerate(legal_action):
                prob[action] = 0.75 * prob[action] + 0.25 * noise[i]
        total_N = edges.sum(0)[N]
        # black's pucb
        if self.board[COLOR][0] == WHITE:
            no_legal_loc *= -99999999
            pucb = edges[:, Q] + \
                c_pucb * prob * np.sqrt(total_N) / (edges[:, N] + 1) + \
                no_legal_loc
        # white's pucb
        else:
            no_legal_loc *= 99999999
            pucb = edges[:, Q] - \
                c_pucb * prob * np.sqrt(total_N) / (edges[:, N] + 1) + \
                no_legal_loc
        return pucb
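
# Minimal usage sketch for this network-guided MCTS. All hyperparameter
# values below are illustrative assumptions, not the repo's settings.
def selfplay_one_move():
    env = OmokEnv(9, 2)  # assumed: board_size=9, n_history=2
    mcts = MCTS(n_block=5, channel=64, board_size=9,
                n_history=2, n_simul=400, mode='learn')
    state, board = env.reset()
    # tau=1 samples the move from pi; any other tau plays the argmax move
    action, pi = mcts.get_action(state, board, tau=1)
    state, board, z, done = env.step(action)
    return pi, z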