def select_action_q_and_u(self, env, is_root_node):
    """Choose the action index with the highest Q + U score for ``env``.

    The stored prior for the position is masked to the legal moves, passed
    through ``self.normalize`` with a turn-dependent temperature, optionally
    mixed with Dirichlet noise at the root, and combined with the visit
    counts to form the exploration term U.

    :param env: environment; ``next_player`` and ``turn`` are read.
    :param is_root_node: if truthy and ``noise_eps > 0``, noise from
        ``dirichlet_noise_of_mask`` is blended into the prior.
    :return: index of the selected action as a plain ``int``.
    """
    key = self.counter_key(env)
    black_to_move = env.next_player == Player.black
    if black_to_move:
        legal_moves = find_correct_moves(key.black, key.white)
    else:
        legal_moves = find_correct_moves(key.white, key.black)
    legal_mask = bit_to_array(legal_moves, 225)

    # sqrt of sum(N(s, b); for all b), floored at 1 so U is not 0 when N is all 0
    sqrt_total_visits = max(np.sqrt(np.sum(self.var_n[key])), 1)

    # restrict the prior to legal moves
    prior = self.var_p[key] * legal_mask
    if np.sum(prior) > 0:
        # decay policy gradually in the end phase
        decay_cfg = self.config.play
        progress = np.power(env.turn / decay_cfg.policy_decay_turn,
                            decay_cfg.policy_decay_power)
        temperature = min(np.exp(1 - progress), 1)
        # normalize and decay policy
        prior = self.normalize(prior, temperature)

    play_config = self.play_config
    if is_root_node and play_config.noise_eps > 0:
        # (1 - e) * p + e * Dir(alpha), noise limited to the legal moves
        noise = dirichlet_noise_of_mask(legal_moves, play_config.dirichlet_alpha)
        prior = (1 - play_config.noise_eps) * prior + play_config.noise_eps * noise

    exploration = play_config.c_puct * prior * sqrt_total_visits / (1 + self.var_n[key])

    q_values = self.var_q(key)
    if not black_to_move:
        # When enemy's selecting action, flip Q-Value.
        q_values = -q_values

    # The +1000 offset is applied before masking; presumably it keeps every
    # legal entry above the zeros left on illegal entries -- TODO confirm.
    scores = (q_values + exploration + 1000) * legal_mask
    return int(np.argmax(scores))
def select_action_q_and_u(self, env, is_root_node):
    """Choose the action index with the highest Q + U score for ``env``.

    At the root the stored prior is first blended with Dirichlet noise drawn
    over all 64 entries; the result is then masked to the legal moves and
    re-normalised before the exploration term U is computed.

    :param env: environment; ``next_player`` is read.
    :param is_root_node: if truthy, Dirichlet noise is mixed into the prior.
    :return: index of the selected action as a plain ``int``.
    """
    key = self.counter_key(env)
    black_to_move = env.next_player == Player.black
    if black_to_move:
        legal_moves = find_correct_moves(key.black, key.white)
    else:
        legal_moves = find_correct_moves(key.white, key.black)

    # sqrt of sum(N(s, b); for all b), floored at 1 so U is not 0 when N is all 0
    sqrt_total_visits = max(np.sqrt(np.sum(self.var_n[key])), 1)

    prior = self.var_p[key]
    if is_root_node:
        # (1 - e) * p + e * Dir(alpha)
        play_config = self.play_config
        noise = np.random.dirichlet([play_config.dirichlet_alpha] * 64)
        prior = (1 - play_config.noise_eps) * prior + play_config.noise_eps * noise

    # re-normalize in legal moves
    legal_mask = bit_to_array(legal_moves, 64)
    prior = prior * legal_mask
    prior_mass = np.sum(prior)
    if prior_mass > 0:
        prior = prior / prior_mass

    exploration = self.play_config.c_puct * prior * sqrt_total_visits / (1 + self.var_n[key])

    q_values = self.var_q[key]
    if not black_to_move:
        # When enemy's selecting action, flip Q-Value.
        q_values = -q_values

    # The +1000 offset is applied before masking; presumably it keeps every
    # legal entry above the zeros left on illegal entries -- TODO confirm.
    scores = (q_values + exploration + 1000) * legal_mask
    return int(np.argmax(scores))
def observation(self):
    """Build the stacked board planes seen by the player to move.

    For each board in ``self._board_history`` (most recent first) two 8x8
    planes are written: the stones of the player to move, then the stones of
    the opponent.

    The buffer is zero-initialised, so planes for history slots that have not
    been filled yet are all zeros. (Allocating with ``np.ndarray`` would leave
    them holding arbitrary uninitialised memory.)

    :return: float array of shape ``(2 * maxlen, 8, 8)`` where ``maxlen`` is
        ``self._board_history.maxlen``.
    """
    history = self._board_history
    ob = np.zeros([2 * history.maxlen, 8, 8])
    black_to_move = self.next_player == Player.black
    for i, board in enumerate(reversed(history)):
        # Plane order is always (player to move, opponent).
        if black_to_move:
            own, enemy = board.black, board.white
        else:
            own, enemy = board.white, board.black
        ob[2 * i] = bit_to_array(own, 64).reshape(8, 8)
        ob[2 * i + 1] = bit_to_array(enemy, 64).reshape(8, 8)
    return ob
async def expand_and_evaluate(self, env):
    """Expand a new leaf: store its policy in ``var_p`` and return its value.

    The board is randomly flipped/rotated before prediction, and the
    predicted policy is mapped back to the original orientation afterwards.

    :param ReversiEnv env:
    :return: leaf_v as a ``float``
    """
    key = self.counter_key(env)
    another_side_key = self.another_side_counter_key(env)
    self.now_expanding.add(key)

    black, white = env.board.black, env.board.white

    # (di(p), v) = f_theta(di(sL)): random symmetry, applied as flip -> rotate.
    is_flip_vertical = random() < 0.5
    rotate_right_num = int(random() * 4)
    if is_flip_vertical:
        black = flip_vertical(black)
        white = flip_vertical(white)
    for _ in range(rotate_right_num):
        # rotate90: rotate bitboard RIGHT 1 time
        black = rotate90(black)
        white = rotate90(white)

    black_plane = bit_to_array(black, 64).reshape((8, 8))
    white_plane = bit_to_array(white, 64).reshape((8, 8))
    # Player to move always goes in the first plane.
    if env.next_player == Player.black:
        state = [black_plane, white_plane]
    else:
        state = [white_plane, black_plane]

    future = await self.predict(np.array(state))  # type: Future
    await future
    leaf_p, leaf_v = future.result()

    # Undo the symmetry on the policy in reverse order: rotate -> flip.
    if rotate_right_num > 0 or is_flip_vertical:
        policy_grid = leaf_p.reshape((8, 8))
        if rotate_right_num > 0:
            # rot90: rotate matrix LEFT k times
            policy_grid = np.rot90(policy_grid, k=rotate_right_num)
        if is_flip_vertical:
            policy_grid = np.flipud(policy_grid)
        leaf_p = policy_grid.reshape((64, ))

    # P is value for next_player (black or white)
    self.var_p[key] = leaf_p
    self.var_p[another_side_key] = leaf_p
    self.expanded.add(key)
    self.now_expanding.remove(key)
    return float(leaf_v)
def bypass_first_move(self, key):
    """Seed the statistics for ``key`` without running a search.

    The first legal move (lowest index in the legal-move array) gets a visit
    count of 1 and an accumulated value of 0, and the prior is set to a
    uniform distribution over the legal moves.

    :param key: position key exposing ``black`` and ``white`` bitboards.
    """
    legal_bits = find_correct_moves(key.black, key.white)
    legal_array = bit_to_array(legal_bits, 64)
    first_legal = np.argmax(legal_array)

    self.var_n[key][first_legal] = 1
    self.var_w[key][first_legal] = 0

    # uniform prior over the legal moves
    legal_count = np.sum(legal_array)
    self.var_p[key] = legal_array / legal_count
def convert_to_training_data(data):
    """Turn buffered self-play records into arrays for training.

    :param data: format is SelfPlayWorker.buffer
        list of [(own: bitboard, enemy: bitboard), [policy: float 64 items], z: number]
    :return: tuple ``(states, policies, zs)`` of numpy arrays; each state is a
        pair of 8x8 planes ``[own, enemy]``.
    """
    def to_plane(bits):
        # 64-bit board -> 8x8 array
        return bit_to_array(bits, 64).reshape((8, 8))

    states, policies, outcomes = [], [], []
    # Single pass so that ``data`` may be any iterable, including a generator.
    for record in data:
        state, policy, z = record
        states.append([to_plane(state[0]), to_plane(state[1])])
        policies.append(policy)
        outcomes.append(z)

    return np.array(states), np.array(policies), np.array(outcomes)
def test_dirichlet_noise_of_mask():
    """Noise from ``dirichlet_noise_of_mask`` sums to 1 and stays inside the mask."""
    mask_bits = 47289423
    expected_nonzero = bit_count(mask_bits)

    noise = dirichlet_noise_of_mask(mask_bits, 0.5)

    # the noise is a probability distribution
    total = np.sum(noise)
    assert_almost_equal(1, total)

    # one strictly positive entry per set bit of the mask
    nonzero_entries = np.sum(noise > 0)
    eq_(expected_nonzero, nonzero_entries)

    # multiplying by the mask again must change nothing
    mask_array = bit_to_array(mask_bits, 64)
    masked_noise = noise * mask_array
    eq_(list(noise), list(masked_noise))
def is_game_over(own, action):
    """Return True if the stone played at ``action`` completes five in a row.

    The board is 15x15; ``action`` is a flat index with row ``action // 15``
    and column ``action % 15`` in the array produced by
    ``bit_to_array(own, 225).reshape(15, 15)``.

    Only *contiguous* runs of own stones passing through the played cell are
    counted, along the row, the column and both diagonals. The previous
    implementation never reset or stopped its counters at a gap, so any five
    own stones scattered along a line were reported as a win; it also tested
    diagonals with ``== 5`` while rows/columns effectively accepted five or
    more. Here a run of five or more wins in every direction.

    The played cell itself is always counted as occupied (as the original
    diagonal checks did), so ``own`` is expected to already contain it.

    :param own: bitboard of the stones of the player who just moved.
    :param action: flat index (0..224) of the move just played.
    :return: ``True`` if a contiguous run of at least five stones through
        ``action`` exists, otherwise ``False``.
    """
    size = 15
    limit = 5
    my_board = bit_to_array(own, size * size).reshape(size, size)
    x, y = divmod(action, size)

    def run_length(dx, dy):
        # Number of consecutive own stones starting next to (x, y) and
        # walking in direction (dx, dy) until a gap or the board edge.
        length = 0
        xx, yy = x + dx, y + dy
        while 0 <= xx < size and 0 <= yy < size and my_board[xx][yy] == 1:
            length += 1
            xx += dx
            yy += dy
        return length

    # row, column, main diagonal, anti-diagonal
    for dx, dy in ((0, 1), (1, 0), (1, 1), (1, -1)):
        if 1 + run_length(dx, dy) + run_length(-dx, -dy) >= limit:
            return True
    return False
def legal_moves(self):
    """Return the legal moves of the player to move as a 64-entry array.

    The result is ``bit_to_array`` applied to the bitboard returned by
    ``find_correct_moves(own, enemy)``. No extra "pass" entry is appended,
    so a position without legal moves presumably yields an all-zero array.
    """
    own, enemy = self.get_own_and_enemy()
    return bit_to_array(find_correct_moves(own, enemy), 64)