def get_register_value(raw_value, log2m):
    """
    Derives the HLL register value from a raw value.

    ``raw_value`` is first shifted right by ``log2m`` bits using
    ``BitUtil.unsigned_right_shift_long``; the register value is computed
    from what remains.

    :param raw_value: the raw value to derive the register value from.
    :param log2m: the number of bits to shift ``raw_value`` right by.
    :returns: ``0`` when the shifted value is zero; otherwise one plus
        ``BitUtil.least_significant_bit`` of the shifted value, capped at
        31 and passed through ``BitUtil.to_signed_byte``.
    """
    remaining_bits = BitUtil.unsigned_right_shift_long(raw_value, log2m)

    # The paper does not cover p(0x0), so the special value 0 is used.
    # 0 is also the value registers are initialized with, so the HLL
    # simply ignores such an input. This is acceptable because the
    # probability is 1/(2^(2^register_size_in_bits)).
    if remaining_bits == 0:
        return 0

    # One-indexed position of the least significant set bit, capped at 31.
    one_based_position = 1 + BitUtil.least_significant_bit(remaining_bits)
    return BitUtil.to_signed_byte(min(one_based_position, 31))
Exemple #2
0
    def _add_raw_sparse_probabilistic(self, raw_value):
        """
        Records ``raw_value`` in ``_sparse_probabilistic_storage``, the
        storage used for ``HLLType.SPARSE``.

        The register index is ``raw_value & self._m_bits_mask``; the
        candidate register value p(w) is computed from ``raw_value``
        shifted right by ``self._log2m`` bits. The stored register is only
        replaced when the candidate is strictly greater than the current
        value (a missing register counts as 0).

        :param long raw_value: the raw value to add to the sparse storage.
        :rtype: void
        """
        shifted = BitUtil.unsigned_right_shift_long(raw_value, self._log2m)

        # The paper does not cover p(0x0), so it is treated as the special
        # value 0. Zero is what an "unset" register holds, and unset
        # registers are never stored (saving memory is the very reason the
        # sparse representation exists), so there is nothing to record.
        # This is acceptable because the probability is
        # 1/(2^(2^register_size_in_bits)).
        if shifted == 0:
            return

        # p(w): position of the least significant set bit (one-indexed).
        # By contract: p(w) <= 2^(register_value_in_bits) - 1 (the max
        # register value).
        #
        # By construction of pw_max_mask (see constructor),
        #      lsb(pw_max_mask) = 2^(register_value_in_bits) - 2,
        # thus lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) - 2,
        # thus 1 + lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) - 1.
        candidate = BitUtil.to_signed_byte(
            1 + BitUtil.least_significant_bit(shifted | self._pw_max_mask))

        # A zero register must never be stored: it would break the
        # algorithm_cardinality code.
        if candidate == 0:
            return

        # NOTE:  no +1 as in paper since 0-based indexing
        register_index = int(raw_value & self._m_bits_mask)

        registers = self._sparse_probabilistic_storage
        if candidate > registers.get(register_index, 0):
            registers[register_index] = candidate
Exemple #3
0
    def from_bytes(cls, bytes):
        """
        Deserializes the HLL (in ``toBytes()`` format) serialized
        into ``bytes``.

        The metadata read from ``bytes`` determines the HLL type, register
        width, register count and explicit threshold used to construct the
        HLL. For any type other than ``HLLType.EMPTY`` the remaining words
        are then read into the matching storage of the new HLL.

        :param list bytes: the serialized bytes of new HLL
        :returns: the deserialized HLL. This will never be ``None``.
        :rtype: HLL
        :raises Exception: if the metadata reports an HLL type other than
            ``EMPTY``, ``EXPLICIT``, ``SPARSE`` or ``FULL``.
        """
        # NOTE(review): imported at call time as in the original code,
        # presumably to avoid a circular import -- confirm before moving.
        from python_hll.hllutil import HLLUtil
        schema_version = SerializationUtil.get_schema_version(bytes)
        metadata = schema_version.read_metadata(bytes)

        # Named ``hll_type`` so the ``type`` builtin is not shadowed.
        hll_type = metadata.hll_type()
        reg_width = metadata.register_width()
        log_2m = metadata.register_count_log2()
        sparseon = metadata.sparse_enabled()

        if metadata.explicit_auto():
            expthresh = -1
        elif metadata.explicit_off():
            expthresh = 0
        else:
            # NOTE: take into account that the postgres-compatible constructor
            # subtracts one before taking a power of two.
            expthresh = metadata.log2_explicit_cutoff() + 1

        hll = HLL(log_2m, reg_width, expthresh, sparseon, hll_type)

        # Short-circuit on empty, which needs no other deserialization.
        if hll_type == HLLType.EMPTY:
            return hll

        # Pick the word length for the storage format. Unsupported types
        # are rejected here, before a deserializer is created, so the
        # population step below only ever sees EXPLICIT, SPARSE or FULL.
        if hll_type == HLLType.EXPLICIT:
            word_length = HLLUtil.LONG_BIT_LENGTH  # 64 for both java and python
        elif hll_type == HLLType.SPARSE:
            word_length = hll._short_word_length
        elif hll_type == HLLType.FULL:
            word_length = hll._regwidth
        else:
            raise Exception('Unsupported HLL type: {}'.format(hll_type))

        deserializer = schema_version.get_deserializer(hll_type, word_length,
                                                       bytes)
        if hll_type == HLLType.EXPLICIT:
            # NOTE:  This should not exceed expthresh and this will always
            #        be exactly the number of words that were encoded,
            #        because the word length is at least a byte wide.
            # SEE:   BigEndianAscendingWordDeserializer.total_word_count()
            for _ in range(deserializer.total_word_count()):
                hll._explicit_storage.add(deserializer.read_word())

        elif hll_type == HLLType.SPARSE:
            # NOTE:  If the short_word_length were smaller than 8 bits
            #        (1 byte) there would be a possibility (because of
            #        padding arithmetic) of having one or more extra
            #        registers read. However, this is not relevant as the
            #        extra registers will be all zeroes, which are ignored
            #        in the sparse representation.
            for _ in range(deserializer.total_word_count()):
                short_word = deserializer.read_word()

                # The register value is masked out of the word with
                # ``_value_mask``; the register key is what is left after
                # shifting the word right by ``_regwidth`` bits.
                register_value = BitUtil.to_signed_byte(short_word
                                                        & hll._value_mask)
                # Only set non-zero registers.
                if register_value != 0:
                    register_key = int(
                        BitUtil.unsigned_right_shift_long(
                            short_word, hll._regwidth))
                    hll._sparse_probabilistic_storage[
                        register_key] = register_value

        elif hll_type == HLLType.FULL:
            # NOTE:  Iteration is done using m (register count) and NOT
            #        deserializer.total_word_count() because regwidth may be
            #        less than 8 and as such the padding on the 'last' byte
            #        may be larger than regwidth, causing an extra register
            #        to be read.
            # SEE: BigEndianAscendingWordDeserializer.total_word_count()
            for register_index in range(hll._m):
                hll._probabilistic_storage.set_register(
                    register_index, deserializer.read_word())

        return hll
def test_unsigned_right_shift_long2():
    """Shifting ``-1`` by zero bits is expected to give back ``-1``."""
    shifted = BitUtil.unsigned_right_shift_long(-1, 0)
    assert shifted == -1
def test_unsigned_right_shift_long():
    """Shifting ``-100`` right by one bit is expected to give a positive value."""
    # The expected value equals (2**64 - 100) >> 1, i.e. 2**63 - 50.
    expected = 9223372036854775758
    shifted = BitUtil.unsigned_right_shift_long(-100, 1)
    assert shifted == expected