Esempio n. 1
0
def test_l1_layer(byte_value: int, num_bytes: int) -> None:
    """Check Poppy's cumulative L1 counts against a direct recomputation.

    The input is a bit array made of ``num_bytes`` copies of ``byte_value``.
    The expected value for lower block ``k`` (2048 bits / 256 bytes each) is
    the number of one bits in the preceding lower blocks of the same upper
    block (2**29 bytes); the first lower block of every upper block is 0.
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)

    # Manually compute the popcount sums here: one entry per 2048 input bits.
    level_1_size = math.ceil(len(bits) / 2048)
    level_1: List[int] = [0] * level_1_size

    # Credit each 8-byte word to the entry of the *next* lower block, so that
    # entry k initially holds the popcount of lower block k - 1.
    v = memoryview(bits)
    for byte_offset in range(0, len(v), 8):
        level_1_idx = 1 + byte_offset // 256
        if level_1_idx < len(level_1):
            level_1[level_1_idx] += popcount(v[byte_offset:byte_offset + 8])

    # Turn the per-block counts into running sums that restart at every upper
    # block boundary.  The previous version zeroed the boundary entries first
    # and then ran an unconditional prefix sum, which added the preceding
    # total straight back in; the structure under test skips boundary entries
    # while accumulating, so the expectation must do the same.
    lower_blocks_per_upper_block = (1 << 29) // 256
    for i in range(1, len(level_1)):
        if i % lower_blocks_per_upper_block == 0:
            level_1[i] = 0
        else:
            level_1[i] += level_1[i - 1]

    poppy = Poppy(bits)
    # Python will literally asplode if we try to use list equality to compare
    # the two lists, so compare entry by entry.  Poppy interleaves the
    # cumulative counts with the packed relative counts, hence the 2 * i.
    for i, expected in enumerate(level_1):
        assert poppy._level_1[2 * i] == expected, f"Failed at {i}"
Esempio n. 2
0
    def rank(self, i: int) -> int:
        """
        Count the 1 bits in positions 0..i (position i included).

        The answer is assembled from the upper-block (L0) count, the
        lower-block (L1) cumulative count, the packed per-basic-block counts
        and finally a direct scan of the basic block that contains bit i.
        https://en.wikipedia.org/wiki/Succinct_data_structure#Succinct_dictionaries
        """
        byte_pos = i // 8
        cumulative_slot = 2 * (byte_pos // 256)

        # Precomputed contributions: upper block first, then lower block.
        total = 0 + self._level_0[byte_pos // (1 << 29)]
        total += self._level_1[cumulative_slot]

        # Add the counts of the basic blocks (64 bytes each) that lie to the
        # left of the target inside the same lower block.
        packed = self._level_1[cumulative_slot + 1]
        for block in range((byte_pos % 256) // 64):
            total += self._get_relative_count(
                basic_block_index=block, packed_relative_counts=packed)

        # Scan the basic block holding bit i, from its first bit up to i.
        view = self._memory_view
        cursor = 8 * (byte_pos // 64) * 64
        stop = i + 1

        # Whole 64-bit words.
        while cursor + 64 <= stop:
            first_byte = cursor // 8
            total += popcount(view[first_byte:first_byte + 8])
            cursor += 64

        # Whole bytes (table row 7 is used for a complete byte).
        while cursor + 8 <= stop:
            total += RANK_IN_BYTE[256 * 7 + view[cursor // 8]]
            cursor += 8

        # Leftover bits of the final, partially covered byte.
        if cursor < stop:
            total += RANK_IN_BYTE[256 * (stop - cursor - 1) +
                                  view[cursor // 8]]

        return total
Esempio n. 3
0
def test_l0_layer(byte_value: int, num_bytes: int) -> None:
    """Check Poppy's cumulative L0 counts against a direct recomputation.

    The input is a bit array made of ``num_bytes`` copies of ``byte_value``;
    there is one expected entry per 2**32 input bits (2**29 bytes).
    """
    bits = bitarray()
    bits.frombytes(bytes([byte_value]) * num_bytes)

    upper_block_bytes = 1 << 29
    expected: List[int] = [0] * math.ceil(len(bits) / (upper_block_bytes * 8))

    # Credit every 8-byte word to the entry of the following upper block.
    raw = memoryview(bits)
    for word_start in range(0, len(raw), 8):
        target = word_start // upper_block_bytes + 1
        if target >= len(expected):
            continue
        expected[target] += popcount(raw[word_start:word_start + 8])

    # Convert the per-block counts into running totals in place.
    running = 0
    for idx, count in enumerate(expected):
        running += count
        expected[idx] = running

    poppy = Poppy(bits)
    assert list(poppy._level_0) == expected
Esempio n. 4
0
    def _initialize_select_structure(self) -> "List[array]":
        """
        Build the sampled select answers, one array per upper (L0) block.

        Entry k of an upper block's array is the bit position, measured from
        the start of that upper block, of the one bit whose zero-based rank
        inside the block is 8192 * k.
        """
        total_bytes = len(self._memory_view)
        samples_per_block: "List[array]" = []
        for upper_idx, ones_before_block in enumerate(self._level_0):
            # Count the one bits inside this upper block to size its array.
            last_bit = min((upper_idx + 1) * (1 << 32) - 1,
                           len(self._bit_array) - 1)
            ones_in_block = self.rank(last_bit) - ones_before_block
            samples = array('L', [0] * math.ceil(ones_in_block / 8192))
            samples_per_block.append(samples)

            # Walk the block one 8-byte word at a time, recording the position
            # of each sampled one bit as soon as the word containing it is
            # reached.
            first_byte = (1 << 29) * upper_idx
            stop_byte = min(total_bytes, (1 << 29) * (upper_idx + 1))
            ones_seen = 0
            next_target = 0
            for word_start in range(first_byte, stop_byte, 8):
                word = self._memory_view[word_start:word_start + 8]
                ones_before_word = ones_seen
                ones_seen += popcount(word)
                if ones_seen <= next_target:
                    continue
                bit_in_word = select(word, next_target - ones_before_word)
                samples[next_target // 8192] = (
                    8 * (word_start - first_byte) + bit_in_word)
                next_target += 8192

            # Sanity check: the (8192 * k)-th one bit cannot sit before bit
            # position 8192 * k.
            for sample_idx, position in enumerate(samples):
                assert position >= 8192 * sample_idx
        return samples_per_block
Esempio n. 5
0
    def _initialize_rank_structure(self) -> "Tuple[array[int], array[int]]":
        """
        Build the two arrays that back rank queries and return them as
        ``(level_0, level_1)``.

        ``level_0`` has one cumulative count per 2**32 input bits.
        ``level_1`` has two slots per 2048 input bits: the even slot holds the
        cumulative count relative to the enclosing upper block, the odd slot
        holds the packed counts of basic blocks 0, 1 and 2.
        """
        total_bits = len(self._bit_array)
        total_bytes = len(self._memory_view)

        upper_count = math.ceil(total_bits / (1 << 32))
        upper = array('Q', [0] * upper_count)

        lower_len = 2 * math.ceil(total_bits / 2048)
        lower = array('L', [0] * lower_len)

        # Single pass over the input, one 8-byte (64-bit) word at a time.
        for word_start in range(0, total_bytes, 8):
            ones = popcount(self._memory_view[word_start:word_start + 8])

            # L0: credit the word to the entry of the *next* upper block.
            next_upper = word_start // (1 << 29) + 1
            if next_upper < upper_count:
                upper[next_upper] += ones

            # L1: credit the word to the cumulative slot of the *next* lower
            # block.
            own_slot = 2 * (word_start // 256)
            if own_slot + 2 < lower_len:
                lower[own_slot + 2] += ones

            # L2: per-basic-block counts live in the odd slot of the word's
            # own lower block; basic block 3 is not stored.
            basic_block = (word_start % 256) // 64
            if basic_block != 3:
                packed_slot = own_slot + 1
                lower[packed_slot] = self._add_relative_count(
                    basic_block_index=basic_block,
                    packed_relative_counts=lower[packed_slot],
                    pop_count=ones)

        # Prefix sums over the upper-block counts.
        for k in range(1, upper_count):
            upper[k] += upper[k - 1]

        # The first lower block of every upper block must start from zero;
        # otherwise the L1 sums would keep growing across upper blocks and
        # could overflow.
        for boundary_byte in range(0, total_bytes, 1 << 29):
            lower[2 * (boundary_byte // 256)] = 0

        # Prefix sums over the even (cumulative) slots, leaving the slots at
        # upper block boundaries untouched.
        for slot in range(0, lower_len, 2):
            block_start_byte = (slot // 2) * 256
            if block_start_byte % (1 << 29) != 0:
                lower[slot] += lower[slot - 2]

        return (upper, lower)
Esempio n. 6
0
    def select(self, rank: int) -> int:
        """
        Returns the position of the 1-bit having the provided rank.
        If no such bit exists, -1 is returned.

        Ranks are zero-based: the precomputed sample for relative rank 0 is
        the first 1-bit of an upper block.  The search narrows from the upper
        (L0) block, to the sampled select answers, to a lower (L1) block, to
        a basic block, and finally scans words and bytes directly.
        """

        # Use binary search to find the upper (L0) block that contains the
        # bit with the target rank.  A negative result is mapped to
        # -(result) - 1.
        # NOTE(review): presumably a negative value encodes an insertion
        # point -- confirm against _binary_search_level_0.
        level_0_idx = self._binary_search_level_0(rank)
        if level_0_idx < 0:
            level_0_idx = -(level_0_idx) - 1
            assert level_0_idx >= 0

        # Rank of the target bit relative to the start of its upper block.
        relative_rank = rank - self._level_0[level_0_idx]
        assert relative_rank >= 0

        # Search the sampling answers corresponding to level_0_idx
        # Use them to find the lower block that contains the target
        # bit.
        sampling_answers = self._select_structure[level_0_idx]
        x = relative_rank // 8192
        if x >= len(sampling_answers):
            # The rank lies beyond the last sampled 1-bit of this upper
            # block, so no such bit exists.  Previously only exact multiples
            # of 8192 were guarded; any other out-of-range rank raised
            # IndexError on sampling_answers[x] instead of returning -1.
            return -1
        if relative_rank % 8192 == 0:
            # Just use one of the precomputed answers.
            # NOTE(review): samples are stored relative to the start of the
            # upper block; for level_0_idx > 0 this value is returned without
            # adding the block's base offset -- confirm that is intended.
            return sampling_answers[x]

        # Otherwise we have to search between this sample and the next one.
        search_start_bit = sampling_answers[x]
        if x + 1 < len(sampling_answers):
            search_end_bit = sampling_answers[x + 1]
        else:
            # NOTE(review): this bound mixes an absolute length with
            # block-relative sample positions and subtracts level_0_idx;
            # verify for inputs spanning more than one upper block.
            search_end_bit = min(
                len(self) - 1 - level_0_idx, (1 << 32) * (level_0_idx + 1))

        # Do a binary search for the L1 block that contains the 1-bit
        # with the desired relative rank.  A negative result is mapped to
        # -(result) - 2.
        level_1_idx = self._binary_search_level_1(
            relative_rank, (search_start_bit // 2048) * 2,
            (search_end_bit // 2048) * 2 - 2)
        if level_1_idx < 0:
            level_1_idx = -(level_1_idx) - 2

        relative_rank -= self._level_1[level_1_idx]
        assert relative_rank >= 0
        packed_relative_counts = self._level_1[level_1_idx + 1]

        # Walk basic blocks 0..2 using their stored counts.  Basic block 3
        # has no stored count, so reaching it ends the walk with
        # basic_block_idx == 3.
        for basic_block_idx in range(0, 4):
            if basic_block_idx == 3:
                break
            relative_count = self._get_relative_count(
                basic_block_index=basic_block_idx,
                packed_relative_counts=packed_relative_counts)
            if relative_rank < relative_count:
                break
            relative_rank -= relative_count
            assert relative_rank >= 0

        # Now search within the 64-byte basic block.
        # NOTE(review): level_1_idx indexes self._level_1 directly, yet the
        # upper block's byte offset is added again here -- confirm for
        # level_0_idx > 0.
        byte_offset = 64 * basic_block_idx + 256 * (level_1_idx // 2) + (
            1 << 29) * level_0_idx
        start_bit = 8 * (byte_offset // 64) * 64
        end_bit = min(start_bit + 4096,
                      len(self) - 1, (1 << 32) * (level_0_idx + 1) - 1)

        # Whole 64-bit words.
        while start_bit + 64 <= end_bit:
            start_byte = start_bit // 8
            ones_in_word = popcount(
                self._memory_view[start_byte:(start_byte + 8)])
            if relative_rank < ones_in_word:
                return start_bit + select(
                    self._memory_view[start_byte:(start_byte + 8)],
                    relative_rank)

            relative_rank -= ones_in_word
            assert relative_rank >= 0
            start_bit += 64

        # Whole bytes.
        while start_bit + 8 <= end_bit:
            start_byte = start_bit // 8
            ones_in_byte = RANK_IN_BYTE[256 * 7 +
                                        self._memory_view[start_byte]]
            if relative_rank < ones_in_byte:
                return start_bit + SELECT_IN_BYTE[
                    256 * (relative_rank) + self._memory_view[start_byte]]
            relative_rank -= ones_in_byte
            assert relative_rank >= 0
            start_bit += 8

        # Leftover bits of the final, partially covered byte.
        if start_bit < end_bit:
            slack = end_bit - start_bit - 1
            start_byte = start_bit // 8
            ones_in_byte = RANK_IN_BYTE[256 * slack +
                                        self._memory_view[start_byte]]
            if relative_rank < ones_in_byte:
                return start_bit + SELECT_IN_BYTE[
                    256 * (relative_rank) + self._memory_view[start_byte]]
            relative_rank -= ones_in_byte
            assert relative_rank >= 0

        # The scans above stop short of end_bit itself; check it explicitly.
        if relative_rank == 0 and self._bit_array[end_bit]:
            return end_bit

        return -1
Esempio n. 7
0
def test_popcount(bb: bytes) -> None:
    """Check ``popcount`` against a bit count derived from binary strings."""
    expected = 0
    for octet in bb:
        # Count the '1' digits in the byte's binary representation.
        expected += f"{octet:b}".count("1")
    assert popcount(bb) == expected