def select_action_q_and_u(self, env, is_root_node):
    """Return the index of the legal move with the highest Q + U score.

    The prior for the current position is optionally blended with Dirichlet
    noise, masked to the legal moves and re-normalised, then turned into the
    exploration term U. Q is negated when the side to move is not black.

    :param env: game environment; ``next_player`` is read directly and the
        object is passed to ``self.counter_key`` to obtain the lookup key.
    :param is_root_node: when truthy and ``play_config.noise_eps > 0``,
        Dirichlet noise is mixed into the prior before scoring.
    :return: ``int`` index selected by ``np.argmax`` over the masked scores.
    """
    key = self.counter_key(env)

    # The side to move always supplies the first bitboard argument.
    black_to_move = env.next_player == Player.black
    if black_to_move:
        own, enemy = key.black, key.white
    else:
        own, enemy = key.white, key.black
    legal_moves = find_correct_moves(own, enemy)

    # sqrt of the summed visit counts, floored at 1 so U is not forced to
    # zero while every count is still 0.
    # noinspection PyUnresolvedReferences
    visit_scale = max(np.sqrt(np.sum(self.var_n[key])), 1)
    prior = self.var_p[key]

    if is_root_node and self.play_config.noise_eps > 0:
        # Blend: (1 - eps) * prior + eps * Dir(alpha)
        cfg = self.play_config
        eps = cfg.noise_eps
        if cfg.dirichlet_noise_only_for_legal_moves:
            noise = dirichlet_noise_of_mask(legal_moves, cfg.dirichlet_alpha)
        else:
            noise = np.random.dirichlet([cfg.dirichlet_alpha] * 64)
        prior = (1 - eps) * prior + eps * noise

    # Keep only legal entries, then rescale them to sum to 1 when possible.
    prior = prior * bit_to_array(legal_moves, 64)
    mass = np.sum(prior)
    if mass > 0:
        prior = prior / mass

    exploration = self.play_config.c_puct * prior * visit_scale / (1 + self.var_n[key])

    # Q is stored from one fixed perspective; flip it for the other player.
    q_value = self.var_q[key] if black_to_move else -self.var_q[key]

    # The +1000 offset presumably keeps every legal score above the zeros the
    # mask leaves on illegal entries (assumes a 0/1 mask and |Q + U| << 1000).
    scores = (q_value + exploration + 1000) * bit_to_array(legal_moves, 64)

    # noinspection PyTypeChecker
    return int(np.argmax(scores))
def select_action_q_and_u(self, env, is_root_node):
    """Return the index of the legal move with the highest Q + U score.

    The prior is masked to the legal moves and, when any mass remains, passed
    through ``self.normalize`` with a temperature derived from ``env.turn``.
    At the root, Dirichlet noise over the legal moves is blended in before
    the exploration term U is computed. Q is negated when the side to move is
    not black.

    :param env: game environment; ``next_player`` and ``turn`` are read
        directly and the object is passed to ``self.counter_key``.
    :param is_root_node: when truthy and ``play_config.noise_eps > 0``,
        Dirichlet noise is mixed into the prior before scoring.
    :return: ``int`` index selected by ``np.argmax`` over the masked scores.
    """
    key = self.counter_key(env)

    # The side to move always supplies the first bitboard argument.
    black_to_move = env.next_player == Player.black
    if black_to_move:
        own, enemy = key.black, key.white
    else:
        own, enemy = key.white, key.black
    legal_moves = find_correct_moves(own, enemy)

    # sqrt of the summed visit counts, floored at 1 so U is not forced to
    # zero while every count is still 0.
    # noinspection PyUnresolvedReferences
    visit_scale = max(np.sqrt(np.sum(self.var_n[key])), 1)

    # Restrict the prior to legal moves.
    prior = self.var_p[key] * bit_to_array(legal_moves, 225)
    if np.sum(prior) > 0:
        # Temperature is capped at 1 and shrinks once env.turn exceeds
        # policy_decay_turn, so the policy is decayed late in the game.
        decay_cfg = self.config.play
        progress = env.turn / decay_cfg.policy_decay_turn
        exponent = 1 - np.power(progress, decay_cfg.policy_decay_power)
        temperature = min(np.exp(exponent), 1)
        prior = self.normalize(prior, temperature)

    if is_root_node and self.play_config.noise_eps > 0:
        # Blend: (1 - eps) * prior + eps * Dir(alpha), legal moves only.
        eps = self.play_config.noise_eps
        noise = dirichlet_noise_of_mask(legal_moves, self.play_config.dirichlet_alpha)
        prior = (1 - eps) * prior + eps * noise

    exploration = self.play_config.c_puct * prior * visit_scale / (1 + self.var_n[key])

    # Q is stored from one fixed perspective; flip it for the other player.
    q_value = self.var_q(key) if black_to_move else -self.var_q(key)

    # The +1000 offset presumably keeps every legal score above the zeros the
    # mask leaves on illegal entries (assumes a 0/1 mask and |Q + U| << 1000).
    scores = (q_value + exploration + 1000) * bit_to_array(legal_moves, 225)

    # noinspection PyTypeChecker
    return int(np.argmax(scores))
def test_dirichlet_noise_of_mask():
    """Check that masked Dirichlet noise sums to 1 and stays inside the mask."""
    mask = 47289423
    expected_nonzero = bit_count(mask)

    sample = dirichlet_noise_of_mask(mask, 0.5)

    # The sample sums to 1 (within assert_almost_equal's tolerance).
    assert_almost_equal(1, np.sum(sample))

    # Exactly one strictly positive entry per set bit of the mask.
    eq_(expected_nonzero, np.sum(sample > 0))

    # Multiplying by the 64-entry mask array must leave the sample unchanged,
    # i.e. no weight sits on an entry the mask zeroes out.
    mask_array = bit_to_array(mask, 64)
    eq_(list(sample), list(sample * mask_array))