Esempio n. 1
0
    def select_action_q_and_u(self, env, is_root_node):
        key = self.counter_key(env)
        if env.next_player == Player.black:
            legal_moves = find_correct_moves(key.black, key.white)
        else:
            legal_moves = find_correct_moves(key.white, key.black)
        # noinspection PyUnresolvedReferences
        xx_ = np.sqrt(np.sum(self.var_n[key]))  # SQRT of sum(N(s, b); for all b)
        xx_ = max(xx_, 1)  # avoid u_=0 if N is all 0
        p_ = self.var_p[key]

        if is_root_node and self.play_config.noise_eps > 0:  # Is it correct?? -> (1-e)p + e*Dir(alpha)
            if self.play_config.dirichlet_noise_only_for_legal_moves:
                noise = dirichlet_noise_of_mask(legal_moves, self.play_config.dirichlet_alpha)
            else:
                noise = np.random.dirichlet([self.play_config.dirichlet_alpha] * 64)
            p_ = (1 - self.play_config.noise_eps) * p_ + self.play_config.noise_eps * noise

        # re-normalize in legal moves
        p_ = p_ * bit_to_array(legal_moves, 64)
        if np.sum(p_) > 0:
            p_ = p_ / np.sum(p_)

        u_ = self.play_config.c_puct * p_ * xx_ / (1 + self.var_n[key])
        if env.next_player == Player.black:
            v_ = (self.var_q[key] + u_ + 1000) * bit_to_array(legal_moves, 64)
        else:
            # When enemy's selecting action, flip Q-Value.
            v_ = (-self.var_q[key] + u_ + 1000) * bit_to_array(legal_moves, 64)

        # noinspection PyTypeChecker
        action_t = int(np.argmax(v_))
        return action_t
Esempio n. 2
0
    def select_action_q_and_u(self, env, is_root_node):
        key = self.counter_key(env)
        if env.next_player == Player.black:
            legal_moves = find_correct_moves(key.black, key.white)
        else:
            legal_moves = find_correct_moves(key.white, key.black)
        # noinspection PyUnresolvedReferences
        xx_ = np.sqrt(np.sum(self.var_n[key]))  # SQRT of sum(N(s, b); for all b)
        xx_ = max(xx_, 1)  # avoid u_=0 if N is all 0
        p_ = self.var_p[key]

        # re-normalize in legal moves
        p_ = p_ * bit_to_array(legal_moves, 225)
        if np.sum(p_) > 0:
            # decay policy gradually in the end phase
            _pc = self.config.play
            temperature = min(np.exp(1-np.power(env.turn/_pc.policy_decay_turn, _pc.policy_decay_power)), 1)
            # normalize and decay policy
            p_ = self.normalize(p_, temperature)

        if is_root_node and self.play_config.noise_eps > 0:  # Is it correct?? -> (1-e)p + e*Dir(alpha)
            noise = dirichlet_noise_of_mask(legal_moves, self.play_config.dirichlet_alpha)
            p_ = (1 - self.play_config.noise_eps) * p_ + self.play_config.noise_eps * noise

        u_ = self.play_config.c_puct * p_ * xx_ / (1 + self.var_n[key])
        if env.next_player == Player.black:
            v_ = (self.var_q(key) + u_ + 1000) * bit_to_array(legal_moves, 225)
        else:
            # When enemy's selecting action, flip Q-Value.
            v_ = (-self.var_q(key) + u_ + 1000) * bit_to_array(legal_moves, 225)

        # noinspection PyTypeChecker
        action_t = int(np.argmax(v_))
        return action_t
def test_dirichlet_noise_of_mask():
    legal_moves = 47289423
    bc = bit_count(legal_moves)
    noise = dirichlet_noise_of_mask(legal_moves, 0.5)
    assert_almost_equal(1, np.sum(noise))
    eq_(bc, np.sum(noise > 0))
    ary = bit_to_array(legal_moves, 64)
    eq_(list(noise), list(noise * ary))