def __init__(self, game_state: Quoridor, policy_output, value_output):
    """Set up the bookkeeping this object keeps for a single game state.

    Args:
        game_state: The position being recorded. It is queried for its legal
            moves, the player to move, and its hash key.
        policy_output: Stored unchanged as ``_policy`` (presumably the policy
            planes produced for this state -- confirm against the caller).
        value_output: Stored unchanged as ``_value`` (presumably the value
            estimate produced for this state -- confirm against the caller).
    """
    # Every possible action owns one cell of a 3 x 9 x 9 block of planes.
    plane_shape = (3, 9, 9)

    # Per-action statistics. _counts holds how many times each action has been
    # taken *from this state*; it starts at zero and spans all possible
    # actions, to be masked later with the set of legal actions.
    # _total_reward is a zero tensor of the same shape (presumably the summed
    # reward per action -- confirm where it is updated).
    self._counts = torch.zeros(plane_shape)
    self._total_reward = torch.zeros(plane_shape)

    # Outputs handed in by the caller, kept as given.
    self._policy = policy_output
    self._value = value_output

    # Facts read from the game state: the legal moves encoded as action planes
    # for the player to move, that player, and the state's hash key.
    legal_moves = game_state.all_legal_moves()
    self._legal_mask = encode_actions_to_planes(legal_moves, game_state.current_player)
    self._player = game_state.current_player
    self._key = game_state.hash_key()

    # No children recorded yet; the flag starts cleared.
    self._children = {}
    self.__flagged = False
col) + "v" if temperature < 1e-6: # Do max operation instead of unstable low-temperature manipulations idx = torch.argmax(policy_planes) else: idx = torch.multinomial(policy_planes.flatten()**temperature, num_samples=1) return _idx_to_action(idx.item()) if __name__ == '__main__': # mini test q = Quoridor() legal_moves = q.all_legal_moves(partial_check=False) print("INITIAL STATE LEGAL MOVES ({} of them):".format(len(legal_moves))) print(legal_moves) for mv in legal_moves: planes = encode_actions_to_planes(mv, q.current_player) print("=========== {} ============".format(mv)) print(planes) mv2 = sample_action(planes, 0) print(mv2) assert mv2 == mv, "Failed to encode/decode {}".format(mv) # Test that just sampling random moves leads to some illegal moves getting selected (this is expected) random_actions, masked_random_actions = [''] * 100, [''] * 100 legal_mask = encode_actions_to_planes(legal_moves, q.current_player) for i in range(100):