Example #1
0
    def select_action_q_and_u(self, env, is_root_node):
        """Pick the child action maximizing Q + U (PUCT) over the legal moves."""
        key = self.counter_key(env)
        # Legal moves are computed from the side-to-move's perspective.
        if env.next_player == Player.black:
            legal_moves = find_correct_moves(key.black, key.white)
        else:
            legal_moves = find_correct_moves(key.white, key.black)
        legal_array = bit_to_array(legal_moves, 225)

        # noinspection PyUnresolvedReferences
        sqrt_total_n = np.sqrt(np.sum(self.var_n[key]))  # SQRT of sum(N(s, b); for all b)
        sqrt_total_n = max(sqrt_total_n, 1)  # keeps U non-zero when all N are 0

        # Mask priors to legal moves, then re-normalize with a decaying temperature.
        policy = self.var_p[key] * legal_array
        if np.sum(policy) > 0:
            # Sharpen (decay) the policy gradually in the end phase of the game.
            play_cfg = self.config.play
            temperature = min(np.exp(1 - np.power(env.turn / play_cfg.policy_decay_turn, play_cfg.policy_decay_power)), 1)
            policy = self.normalize(policy, temperature)

        eps = self.play_config.noise_eps
        if is_root_node and eps > 0:
            # Root exploration noise: (1-e)*p + e*Dir(alpha), restricted to legal moves.
            noise = dirichlet_noise_of_mask(legal_moves, self.play_config.dirichlet_alpha)
            policy = (1 - eps) * policy + eps * noise

        u_term = self.play_config.c_puct * policy * sqrt_total_n / (1 + self.var_n[key])
        # Flip Q when the opponent is selecting; +1000 keeps legal entries above zeroed illegal ones.
        q_term = self.var_q(key) if env.next_player == Player.black else -self.var_q(key)
        scores = (q_term + u_term + 1000) * legal_array

        # noinspection PyTypeChecker
        return int(np.argmax(scores))
    def select_action_q_and_u(self, env, is_root_node):
        """Choose the move maximizing Q + U (PUCT) among the legal moves."""
        key = self.counter_key(env)
        if env.next_player == Player.black:
            legal_moves = find_correct_moves(key.black, key.white)
        else:
            legal_moves = find_correct_moves(key.white, key.black)

        # noinspection PyUnresolvedReferences
        sqrt_total_n = np.sqrt(np.sum(self.var_n[key]))  # SQRT of sum(N(s, b); for all b)
        sqrt_total_n = max(sqrt_total_n, 1)  # so U stays non-zero when all N are 0
        policy = self.var_p[key]

        if is_root_node:  # Is it correct?? -> (1-e)p + e*Dir(alpha)
            eps = self.play_config.noise_eps
            dirichlet = np.random.dirichlet([self.play_config.dirichlet_alpha] * 64)
            policy = (1 - eps) * policy + eps * dirichlet

        # Zero out illegal moves, then re-normalize what remains.
        legal_array = bit_to_array(legal_moves, 64)
        policy = policy * legal_array
        total = np.sum(policy)
        if total > 0:
            policy = policy / total

        u_term = self.play_config.c_puct * policy * sqrt_total_n / (1 + self.var_n[key])
        # When the enemy is selecting, flip the Q-value; +1000 keeps every
        # legal entry above the zeroed illegal ones under argmax.
        q_term = self.var_q[key] if env.next_player == Player.black else -self.var_q[key]
        scores = (q_term + u_term + 1000) * legal_array

        # noinspection PyTypeChecker
        return int(np.argmax(scores))
Example #3
0
    def observation(self):
        """Stack the board history as (2 * maxlen, 8, 8) bit planes.

        For each historical board (most recent first), the side-to-move's
        stones go on the even plane and the opponent's on the odd plane.
        """
        planes = np.ndarray([2 * self._board_history.maxlen, 8, 8])
        own_is_black = self.next_player == Player.black
        for i, board in enumerate(reversed(self._board_history)):
            own, enemy = (board.black, board.white) if own_is_black else (board.white, board.black)
            planes[2 * i] = bit_to_array(own, 64).reshape(8, 8)
            planes[2 * i + 1] = bit_to_array(enemy, 64).reshape(8, 8)
        return planes
Example #4
0
    async def expand_and_evaluate(self, env):
        """Expand a new leaf node and evaluate it with the policy/value network.

        Stores the predicted move priors into ``var_p`` (under both this
        position's key and the other-side key), marks the node expanded,
        and returns the predicted position value.

        A random symmetry (vertical flip and/or right rotations) is applied
        to the board before prediction, and the inverse transform is applied
        to the returned policy so it matches the original orientation.

        :param ReversiEnv env: position to expand (board + side to move)
        :return: leaf_v, the network's value estimate as a float
        """

        key = self.counter_key(env)
        another_side_key = self.another_side_counter_key(env)
        # Mark as in-flight so concurrent searches don't expand the same node.
        self.now_expanding.add(key)

        black, white = env.board.black, env.board.white

        # (di(p), v) = fθ(di(sL))
        # Random dihedral transform: flip first, then rotate (flip -> rot).
        is_flip_vertical = random() < 0.5
        rotate_right_num = int(random() * 4)  # 0..3 right rotations
        if is_flip_vertical:
            black, white = flip_vertical(black), flip_vertical(white)
        for i in range(rotate_right_num):
            black, white = rotate90(black), rotate90(
                white)  # rotate90: rotate bitboard RIGHT 1 time

        # Network input: side-to-move's stones on the first plane.
        black_ary = bit_to_array(black, 64).reshape((8, 8))
        white_ary = bit_to_array(white, 64).reshape((8, 8))
        state = [
            black_ary, white_ary
        ] if env.next_player == Player.black else [white_ary, black_ary]
        # NOTE(review): predict is awaited and its result awaited again —
        # presumably predict() queues the request and returns a Future; confirm.
        future = await self.predict(np.array(state))  # type: Future
        await future
        leaf_p, leaf_v = future.result()

        # reverse rotate and flip about leaf_p
        if rotate_right_num > 0 or is_flip_vertical:  # reverse rotation and flip. rot -> flip.
            leaf_p = leaf_p.reshape((8, 8))
            if rotate_right_num > 0:
                leaf_p = np.rot90(
                    leaf_p,
                    k=rotate_right_num)  # rot90: rotate matrix LEFT k times
            if is_flip_vertical:
                leaf_p = np.flipud(leaf_p)
            leaf_p = leaf_p.reshape((64, ))

        self.var_p[key] = leaf_p  # P is value for next_player (black or white)
        # Same priors cached under the mirrored key (other side to move).
        self.var_p[another_side_key] = leaf_p
        self.expanded.add(key)
        self.now_expanding.remove(key)
        return float(leaf_v)
Example #5
0
 def bypass_first_move(self, key):
     """Seed search stats for a forced first move: one visit on the first legal square."""
     legal_array = bit_to_array(find_correct_moves(key.black, key.white), 64)
     first_legal = np.argmax(legal_array)
     self.var_n[key][first_legal] = 1
     self.var_w[key][first_legal] = 0
     # Uniform prior over the legal squares.
     self.var_p[key] = legal_array / np.sum(legal_array)
    def convert_to_training_data(data):
        """Convert self-play records into (state, policy, z) training arrays.

        :param data: format is SelfPlayWorker.buffer
            list of [(own: bitboard, enemy: bitboard), [policy: float 64 items], z: number]
        :return: tuple of np.ndarray: states (N, 2, 8, 8), policies (N, 64), z values (N,)
        """
        states = []
        policies = []
        zs = []
        for state, policy, z in data:
            own = bit_to_array(state[0], 64).reshape((8, 8))
            enemy = bit_to_array(state[1], 64).reshape((8, 8))
            states.append([own, enemy])
            policies.append(policy)
            zs.append(z)

        return np.array(states), np.array(policies), np.array(zs)
def test_dirichlet_noise_of_mask():
    """Noise over a legal-move mask sums to 1 and is positive exactly on-mask."""
    legal_moves = 47289423
    noise = dirichlet_noise_of_mask(legal_moves, 0.5)
    # The noise is a probability distribution.
    assert_almost_equal(1, np.sum(noise))
    # One positive entry per set bit in the mask.
    eq_(bit_count(legal_moves), np.sum(noise > 0))
    # Re-masking changes nothing: the support lies inside the mask.
    mask_array = bit_to_array(legal_moves, 64)
    eq_(list(noise), list(noise * mask_array))
Example #8
0
def is_game_over(own, action):
    """Return True if the stone just played at ``action`` completes five in a row.

    :param own: 225-bit bitboard of the current player's stones on a 15x15
        board, assumed to already contain the stone at ``action``.
    :param action: flat board index of the move just played;
        row = action // 15, col = action % 15.
    :return: True when the move creates a contiguous line of at least five
        own stones horizontally, vertically, or on either diagonal.
    """
    board = bit_to_array(own, 225).reshape(15, 15)
    x = action // 15
    y = action % 15
    limit = 5

    # The four line directions: vertical, horizontal, and both diagonals.
    # Bug fix: the original counted every own stone on the whole row/column/
    # diagonal without requiring contiguity, so five scattered stones on a
    # line falsely ended the game (and its exact "== 5" test meant extra
    # stones on the line could mask a real win). Here we count only the
    # contiguous run through (x, y) and accept runs of five or more.
    for dx, dy in ((0, 1), (1, 0), (1, 1), (1, -1)):
        count = 1  # the stone at (x, y) itself
        for sign in (1, -1):  # walk outward both ways, stopping at the first gap/edge
            step = 1
            while True:
                xx = x + sign * dx * step
                yy = y + sign * dy * step
                if 0 <= xx < 15 and 0 <= yy < 15 and board[xx][yy] == 1:
                    count += 1
                    step += 1
                else:
                    break
        if count >= limit:
            return True

    return False
Example #9
0
 def legal_moves(self):
     """Return a length-64 0/1 array marking the current player's legal moves."""
     own, enemy = self.get_own_and_enemy()
     legal_bits = find_correct_moves(own, enemy)
     # array = np.append(array, 1 if bit == 0 else 0)  # if no correct move, then you can pass
     return bit_to_array(legal_bits, 64)