def get_pv(self, root_id):
    state = utils.get_state_pt(root_id, self.board_size, self.inplanes)
    self.model.eval()
    with torch.no_grad():
        state_input = torch.tensor([state]).to(device).float()
        policy, value = self.model(state_input)
        p = policy.data.cpu().numpy()[0]
        v = value.data.cpu().numpy()[0]
    return p, v
def _expansion_evaluation(self, leaf_id, win_index):
    leaf_state = utils.get_state_pt(leaf_id, self.board_size, self.inplanes)
    self.model.eval()
    with torch.no_grad():
        state_input = torch.tensor([leaf_state]).to(device).float()
        policy, value = self.model(state_input)
        policy = policy.cpu().numpy()[0]
        value = value.cpu().numpy()[0]

    if win_index == 0:
        # expansion
        actions = utils.legal_actions(leaf_id, self.board_size)
        prior_prob = np.zeros(self.board_size**2)

        # re-normalization
        for action_index in actions:
            prior_prob[action_index] = policy[action_index]
        prior_prob /= prior_prob.sum()

        if self.noise:
            # root node noise
            if leaf_id == self.root_id:
                noise_probs = np.random.dirichlet(
                    self.alpha * np.ones(len(actions)))

        for i, action_index in enumerate(actions):
            child_id = leaf_id + (action_index,)
            prior_p = prior_prob[action_index]

            if self.noise:
                if leaf_id == self.root_id:
                    prior_p = 0.75 * prior_p + 0.25 * noise_probs[i]

            self.tree[child_id] = {'child': [],
                                   'n': 0.,
                                   'w': 0.,
                                   'q': 0.,
                                   'p': prior_p}

            self.tree[leaf_id]['child'].append(action_index)

        # return value
        reward = False
        return value, reward
    else:
        # terminal node
        # return reward
        reward = 1.
        value = False
        return value, reward
def get_pv(self, root_id):
    # state
    # s[t] for t = 0, 1, ..., inplanes - 2:
    #     one-hot array of every stone placed by the player who moved
    #     (inplanes - 2 - t) turns ago
    # s[inplanes - 1]: color feature (all 0 if the last move was Black's,
    #     all 1 if it was White's)
    state = utils.get_state_pt(root_id, self.board_size, self.inplanes)
    self.model.eval()  # put dropout and batch normalization in eval mode
    with torch.no_grad():  # disable gradient tracking to save memory
        # build the input tensor on the chosen device
        state_input = torch.tensor([state]).to(device).float()
        policy, value = self.model(state_input)  # forward pass through the model
        # policy: scored higher for moves that are more likely to win
        p = policy.data.cpu().numpy()[0]
        # value: in [-1, 1]; lower when the player who made the last move
        # is more likely to win
        v = value.data.cpu().numpy()[0]
    return p, v
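As a usage sketch (the agent object, its board_size attribute, and root_id are assumed from the surrounding code, not defined in this excerpt), the returned policy can be reshaped onto the board for inspection and the value mapped to a win percentage for the side to move, the same way the self-play logging below does:

    # hypothetical usage of get_pv; `agent` and `root_id` come from the caller
    p, v = agent.get_pv(root_id)
    board_prior = p.reshape(agent.board_size, agent.board_size)
    print(board_prior.round(2))                 # prior over every board cell
    print('win% of side to move: {:.2f}%'.format((v.item() + 1) / 2 * 100))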
def self_play(n_selfplay):
    global cur_memory, rep_memory
    global Agent

    state_black = deque()
    state_white = deque()
    pi_black = deque()
    pi_white = deque()

    if RESIGN_MODE:
        resign_val_black = []
        resign_val_white = []
        resign_val = []
        resign_v = -1.0
        n_resign_thres = N_SELFPLAY // 4

    for episode in range(n_selfplay):
        if (episode + 1) % 10 == 0:
            logging.warning('Playing Episode {:3}'.format(episode + 1))

        env = game.GameState('text')
        board = np.zeros((BOARD_SIZE, BOARD_SIZE), 'float')
        turn = 0
        root_id = (0,)
        win_index = 0
        time_steps = 0
        action_index = None

        if RESIGN_MODE:
            resign_index = 0

        while win_index == 0:
            if PRINT_SELFPLAY:
                utils.render_str(board, BOARD_SIZE, action_index)

            # ====================== start MCTS ============================ #
            if time_steps < TAU_THRES:
                tau = 1
            else:
                tau = 0

            pi = Agent.get_pi(root_id, tau)

            # ===================== collect samples ======================== #
            state = utils.get_state_pt(root_id, BOARD_SIZE, IN_PLANES)

            if turn == 0:
                state_black.appendleft(state)
                pi_black.appendleft(pi)
            else:
                state_white.appendleft(state)
                pi_white.appendleft(pi)

            # ======================== get action ========================== #
            action, action_index = utils.get_action(pi)
            root_id += (action_index,)

            # ====================== print evaluation ====================== #
            if PRINT_SELFPLAY:
                Agent.model.eval()
                with torch.no_grad():
                    state_input = torch.tensor([state]).to(device).float()
                    p, v = Agent.model(state_input)
                    p = p.cpu().numpy()[0]
                    v = v.item()

                print('\nPi:\n{}'.format(
                    pi.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))
                print('\nPolicy:\n{}'.format(
                    p.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))

                if turn == 0:
                    print("\nBlack's win%: {:.2f}%".format((v + 1) / 2 * 100))
                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            resign_val_black.append(v)
                        elif v < resign_v:
                            resign_index = 2
                            if PRINT_SELFPLAY:
                                print('"Black Resign!"')
                else:
                    print("\nWhite's win%: {:.2f}%".format((v + 1) / 2 * 100))
                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            resign_val_white.append(v)
                        elif v < resign_v:
                            resign_index = 1
                            if PRINT_SELFPLAY:
                                print('"White Resign!"')

            # =========================== step ============================= #
            board, _, win_index, turn, _ = env.step(action)
            time_steps += 1

            # ========================== result ============================ #
            if RESIGN_MODE:
                if resign_index != 0:
                    win_index = resign_index
                    result['Resign'] += 1

            if win_index != 0:
                if win_index == 1:
                    reward_black = 1.
                    reward_white = -1.
                    result['Black'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_black:
                                resign_val.append(val)
                            resign_val_black.clear()
                            resign_val_white.clear()

                elif win_index == 2:
                    reward_black = -1.
                    reward_white = 1.
                    result['White'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_white:
                                resign_val.append(val)
                            resign_val_white.clear()
                            resign_val_black.clear()
                else:
                    reward_black = 0.
                    reward_white = 0.
                    result['Draw'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_black:
                                resign_val.append(val)
                            for val in resign_val_white:
                                resign_val.append(val)
                            resign_val_black.clear()
                            resign_val_white.clear()

                if RESIGN_MODE:
                    if episode + 1 == n_resign_thres:
                        resign_v = min(resign_val)
                        resign_val.clear()

                        if PRINT_SELFPLAY:
                            print('Resign win%: {:.2f}%'.format(
                                (resign_v + 1) / 2 * 100))

                # ====================== store in memory ======================= #
                while state_black or state_white:
                    if state_black:
                        cur_memory.append(
                            (state_black.pop(), pi_black.pop(), reward_black))
                    if state_white:
                        cur_memory.append(
                            (state_white.pop(), pi_white.pop(), reward_white))

                # ========================= result =========================== #
                if PRINT_SELFPLAY:
                    utils.render_str(board, BOARD_SIZE, action_index)

                bw, ww, dr, rs = result['Black'], result['White'], \
                    result['Draw'], result['Resign']
                print('')
                print('=' * 20, " {:3} Game End ".format(episode + 1), '=' * 20)
                print('Black Win: {:3} '
                      'White Win: {:3} '
                      'Draw: {:2} '
                      'Win%: {:.2f}%'
                      '\nResign: {:2}'.format(
                          bw, ww, dr,
                          (bw + 0.5 * dr) / (bw + ww + dr) * 100,
                          rs))
                print('current memory size:', len(cur_memory))

                Agent.reset()

    rep_memory.extend(utils.augment_dataset(cur_memory, BOARD_SIZE))
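The resign threshold above is calibrated on the first quarter of the games: only value estimates recorded by the side that eventually won (or by both sides, in a draw) are pooled, and the threshold becomes the lowest such value, so a player never resigns from a position that some eventual winner has already recovered from. A minimal standalone sketch of that calibration, with made-up numbers:

    # hypothetical calibration data: per-game value traces of the eventual winner
    winner_value_traces = [
        [-0.42, -0.10, 0.35, 0.80],   # this winner was briefly far behind
        [0.05, 0.22, 0.61, 0.93],
        [-0.15, 0.02, 0.48, 0.88],
    ]

    resign_val = [v for trace in winner_value_traces for v in trace]
    resign_v = min(resign_val)        # most pessimistic value a winner ever saw

    print('resign threshold v = {:.2f}'.format(resign_v))
    print('Resign win%: {:.2f}%'.format((resign_v + 1) / 2 * 100))
    # any later position evaluated below resign_v triggers a resignation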
def self_play(agent, cur_memory, rank=0):
    agent.model.eval()

    state_black = deque()
    state_white = deque()
    pi_black = deque()
    pi_white = deque()

    episode = 0
    while True:
        if (episode + 1) % 10 == 0:
            logging.info('Playing Episode {:3}'.format(episode + 1))

        env = game.GameState('text')
        board = np.zeros((BOARD_SIZE, BOARD_SIZE), 'float')
        turn = 0
        root_id = (0,)
        win_index = 0
        time_steps = 0
        action_index = None

        while win_index == 0:
            if PRINT_SELFPLAY and rank == 0:
                utils.render_str(board, BOARD_SIZE, action_index)

            # ====================== start MCTS ============================ #
            if time_steps < TAU_THRES:
                tau = 1
            else:
                tau = 0

            pi = agent.get_pi(root_id, tau, rank)

            # ===================== collect samples ======================== #
            state = utils.get_state_pt(root_id, BOARD_SIZE, IN_PLANES)

            if turn == 0:
                state_black.appendleft(state)
                pi_black.appendleft(pi)
            else:
                state_white.appendleft(state)
                pi_white.appendleft(pi)

            # ======================== get action ========================== #
            action, action_index = utils.get_action(pi)
            root_id += (action_index,)

            # ====================== print evaluation ====================== #
            if PRINT_SELFPLAY and rank == 0:
                with torch.no_grad():
                    state_input = torch.tensor([state]).to(device).float()
                    p, v = agent.model(state_input)
                    p = p.cpu().numpy()[0]
                    v = v.item()

                print('\nPi:\n{}'.format(
                    pi.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))
                print('\nPolicy:\n{}'.format(
                    p.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))

                if turn == 0:
                    print("\nBlack's win%: {:.2f}%".format((v + 1) / 2 * 100))
                else:
                    print("\nWhite's win%: {:.2f}%".format((v + 1) / 2 * 100))

            # =========================== step ============================= #
            board, _, win_index, turn, _ = env.step(action)
            time_steps += 1

            # ========================== result ============================ #
            if win_index != 0:
                if win_index == 1:
                    reward_black = 1.
                    reward_white = -1.
                    result['Black'] += 1
                elif win_index == 2:
                    reward_black = -1.
                    reward_white = 1.
                    result['White'] += 1
                else:
                    reward_black = 0.
                    reward_white = 0.
                    result['Draw'] += 1

                # ====================== store in memory ======================= #
                while state_black or state_white:
                    if state_black:
                        cur_memory.append(
                            (state_black.pop(), pi_black.pop(), reward_black))
                    if state_white:
                        cur_memory.append(
                            (state_white.pop(), pi_white.pop(), reward_white))

                # ========================= result =========================== #
                if PRINT_SELFPLAY and rank == 0:
                    utils.render_str(board, BOARD_SIZE, action_index)
                    bw, ww, dr = result['Black'], result['White'], \
                        result['Draw']
                    print('')
                    print('=' * 20,
                          " {:3} Game End ".format(episode + 1),
                          '=' * 20)
                    print('Black Win: {:3} '
                          'White Win: {:3} '
                          'Draw: {:2} '
                          'Win%: {:.2f}%'.format(
                              bw, ww, dr,
                              (bw + 0.5 * dr) / (bw + ww + dr) * 100))
                    print('current memory size:', len(cur_memory))

                episode += 1
                agent.reset()

                if len(cur_memory) >= MEMORY_SIZE:
                    return utils.augment_dataset(cur_memory, BOARD_SIZE)
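The rank parameter and the return-when-full behavior suggest this variant is meant to run in parallel workers. Purely as a hedged sketch (create_agent, N_PROCESS, and this launcher are assumptions and do not appear in the excerpt), one way such workers could be driven with torch.multiprocessing:

    # Hypothetical launcher sketch, not the project's actual code: each worker
    # runs self_play until cur_memory reaches MEMORY_SIZE, then sends the
    # augmented dataset back through a queue.
    import torch.multiprocessing as mp
    from collections import deque

    def worker(rank, queue):
        agent = create_agent()                    # hypothetical agent factory
        cur_memory = deque(maxlen=MEMORY_SIZE)    # filled by self_play above
        queue.put(self_play(agent, cur_memory, rank))

    if __name__ == '__main__':
        mp.set_start_method('spawn')              # needed if workers touch CUDA tensors
        queue = mp.Queue()
        procs = [mp.Process(target=worker, args=(rank, queue))
                 for rank in range(N_PROCESS)]    # N_PROCESS is an assumed constant
        for p in procs:
            p.start()
        datasets = [queue.get() for _ in procs]   # one augmented dataset per worker
        for p in procs:
            p.join()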
def _expansion_evaluation(self, leaf_id, win_index):
    # one-hot encoded positions of the black and white stones over the last
    # few turns, plus the color feature
    leaf_state = utils.get_state_pt(leaf_id, self.board_size, self.inplanes)
    self.model.eval()  # put dropout and batch normalization in eval mode
    with torch.no_grad():  # disable gradient tracking to save memory
        # build the input tensor on the chosen device
        state_input = torch.tensor([leaf_state]).to(device).float()
        policy, value = self.model(state_input)  # forward pass through the model
        # policy: scored higher for moves that are more likely to win
        policy = policy.cpu().numpy()[0]
        # value: in [-1, 1]; lower when the player who made the last move
        # is more likely to win
        value = value.cpu().numpy()[0]

    if win_index == 0:  # game not decided yet
        # expansion
        # every legal move in the leaf node's board position
        actions = utils.legal_actions(leaf_id, self.board_size)
        # array that will hold the re-normalized policy
        prior_prob = np.zeros(self.board_size**2)

        # re-normalization over the legal moves only
        for action_index in actions:
            prior_prob[action_index] = policy[action_index]
        prior_prob /= prior_prob.sum()

        if self.noise:  # generate noise
            # root node noise
            if leaf_id == self.root_id:
                noise_probs = np.random.dirichlet(
                    self.alpha * np.ones(len(actions)))

        # create a child node for every legal move from the leaf
        for i, action_index in enumerate(actions):
            child_id = leaf_id + (action_index,)
            prior_p = prior_prob[action_index]

            if self.noise:
                if leaf_id == self.root_id:
                    prior_p = 0.75 * prior_p + 0.25 * noise_probs[i]

            # add the child node to the tree
            self.tree[child_id] = {'child': [],
                                   'n': 0.,
                                   'w': 0.,
                                   'q': 0.,
                                   'p': prior_p}

            # register the new child under the leaf node
            self.tree[leaf_id]['child'].append(action_index)

        # return value
        reward = False
        return value, reward
    else:  # the game is already decided
        # terminal node
        # return reward
        reward = 1.
        value = False
        return value, reward
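At the root node the renormalized priors are mixed with Dirichlet noise, P(a) = 0.75 * p(a) + 0.25 * eta(a) with eta ~ Dir(alpha), which keeps every legal move explorable during self-play. A minimal standalone sketch of that prior computation, with a made-up policy vector and legal-move set:

    import numpy as np

    board_size = 3                               # toy board for illustration
    alpha = 0.15                                 # assumed Dirichlet concentration
    policy = np.random.rand(board_size ** 2)     # stand-in for the policy head output
    actions = [0, 2, 4, 7]                       # hypothetical legal action indices

    # re-normalize the policy over the legal moves only
    prior_prob = np.zeros(board_size ** 2)
    prior_prob[actions] = policy[actions]
    prior_prob /= prior_prob.sum()

    # mix in Dirichlet noise at the root, as in the expansion step above
    noise = np.random.dirichlet(alpha * np.ones(len(actions)))
    for i, a in enumerate(actions):
        prior_prob[a] = 0.75 * prior_prob[a] + 0.25 * noise[i]

    print(prior_prob.round(3), prior_prob.sum())  # still sums to 1 over legal moves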