Example #1
def test_to_from_bytes():
    log2m = 11  # arbitrary
    regwidth = 5

    schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION
    type = HLLType.FULL
    padding = schema_version.padding_bytes(type)
    data_byte_count = probabilistic_test_util.get_required_bytes(
        regwidth, BitUtil.left_shift_int(1, log2m))  # aka 2^log2m = m
    expected_byte_count = padding + data_byte_count

    # Should work on an empty element
    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    bytes = hll.to_bytes(schema_version)

    # assert output length is correct
    assert len(bytes) == expected_byte_count

    in_hll = HLL.from_bytes(bytes)
    assert_elements_equal(hll, in_hll)

    # Should work on a partially filled element
    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    for i in range(0, 3):
        raw_value = probabilistic_test_util.construct_hll_value(
            log2m, i, (i + 9))
        hll.add_raw(raw_value)

    bytes = hll.to_bytes(schema_version)

    assert len(bytes) == expected_byte_count

    in_hll = HLL.from_bytes(bytes)

    # assert register values correct
    assert_elements_equal(hll, in_hll)

    # Should work on a full set
    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    for i in range(0, BitUtil.left_shift_int(1, log2m)):
        raw_value = probabilistic_test_util.construct_hll_value(
            log2m, i, (i % 9) + 1)
        hll.add_raw(raw_value)

    bytes = hll.to_bytes(schema_version)

    # assert output length is correct
    assert len(bytes) == expected_byte_count

    in_hll = HLL.from_bytes(bytes)

    # assert register values correct
    assert_elements_equal(hll, in_hll)
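
# Back-of-the-envelope check of the length arithmetic above (a sketch; it assumes
# get_required_bytes(regwidth, m) simply rounds the packed register data up to whole bytes):
#   m = 2**11 = 2048 registers, data = ceil(5 * 2048 / 8) = 1280 bytes,
#   expected_byte_count = schema padding + 1280.
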
def get_register_value(raw_value, log2m):
    """
    Extracts the HLL register value from a raw value.
    """
    substream_value = BitUtil.unsigned_right_shift_long(raw_value, log2m)
    if substream_value == 0:
        # The paper does not cover p(0x0), so the special value 0 is used.
        # 0 is the original initialization value of the registers, so by
        # doing this the HLL simply ignores it. This is acceptable
        # because the probability is 1/(2^(2^register_size_in_bits)).
        p_w = 0
    else:
        p_w = BitUtil.to_signed_byte(
            min(1 + BitUtil.least_significant_bit(substream_value), 31))
    return p_w
def construct_hll_value(log2m, register_index, register_value):
    """
    Constructs a value that when added raw to a HLL will set the register at
    ``register_index`` to ``register_value``.

    :param log2m: The log-base-2 of the number of registers in the HLL
    :type log2m: int
    :param register_index: The index of the register to set
    :type register_index: int
    :param register_value: the value to set the register to
    :type register_value: int
    :rtype: int
    """
    partition = register_index
    substream_value = BitUtil.left_shift_long(1, register_value - 1)
    return BitUtil.left_shift_long(substream_value, log2m) | partition
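
# A minimal round-trip sketch of the two helpers above, using plain-Python
# stand-ins for the BitUtil shifts (an illustration only, not the library code):
def _construct(log2m, register_index, register_value):
    # lowest set bit of the substream lands at position (register_value - 1);
    # the register index occupies the low log2m bits
    return ((1 << (register_value - 1)) << log2m) | register_index


def _register_value(raw_value, log2m):
    substream = raw_value >> log2m
    if substream == 0:
        return 0
    return min(1 + ((substream & -substream).bit_length() - 1), 31)


assert _register_value(_construct(11, 3, 7), 11) == 7
assert _construct(11, 3, 7) & ((1 << 11) - 1) == 3  # the index survives in the low bits
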
Example #4
def test_small_range_smoke():
    """
    Smoke test for HLL.cardinality() and the proper use of the
    small range correction.
    """
    log2m = 11
    m = BitUtil.left_shift_int(1, log2m)
    regwidth = 5

    # only one register set
    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, 0, 1))
    cardinality = hll.cardinality()

    # Trivially true that small correction conditions hold: one register
    # set implies zeroes exist, and estimator trivially smaller than 5m/2.
    # Small range correction: m * log(m/V)
    expected = ceil(m * log(m / (m - 1)))  # V = m - 1 zero registers
    assert cardinality == expected

    # all but one register set
    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    for i in range(0, m - 1):
        hll.add_raw(probabilistic_test_util.construct_hll_value(log2m, i, 1))

    # Trivially true that small correction conditions hold: all but
    # one register set implies a zero exists, and the estimator
    # (roughly alpha_m_squared / ((m + 1)/2) ~= 1.44 * m) is smaller than 5m/2.
    cardinality = hll.cardinality()

    # Small range correction: m * log(m/V)
    expected = ceil(m * log(m / 1))  # V = 1 zero register
    assert cardinality == expected
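
# Plugging the numbers in (a sanity check using the same formula as the test):
from math import ceil, log

m = 1 << 11                               # 2048
# one register set  -> V = m - 1 zero registers
assert ceil(m * log(m / (m - 1))) == 2    # ~1.0002, rounds up to 2
# all but one set   -> V = 1 zero register
assert ceil(m * log(m / 1)) == 15616      # 2048 * ln(2048) ~= 15615.2
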
def get_register_index(raw_value, log2m):
    """
    Extracts the HLL register index from a raw value.
    """
    m_bits_mask = BitUtil.left_shift_int(1, log2m) - 1
    j = raw_value & m_bits_mask
    return j
Example #6
    def _sparse_probabilistic_algorithm_cardinality(self):
        """
        Computes the exact cardinality value returned by the HLL algorithm when
        represented as a ``HLLType.SPARSE`` HLL. Kept
        separate from ``cardinality()`` for testing purposes. ``type``
        must be ``HLLType.SPARSE``.

        :returns: the exact, unrounded cardinality given by the HLL algorithm
        :rtype: float
        """
        from python_hll.hllutil import HLLUtil
        m = self._m

        # compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
        # 'j'th register value
        indicator_function = 0.0
        number_of_zeroes = 0  # "V" in the paper
        for j in range(m):
            register = self._sparse_probabilistic_storage.get(j, 0)

            indicator_function += 1.0 / BitUtil.left_shift_long(1, register)
            if register == 0:
                number_of_zeroes += 1

        # apply the estimate and correction to the indicator function
        estimator = self._alpha_m_squared / indicator_function
        if number_of_zeroes != 0 and estimator < self._small_estimator_cutoff:
            return HLLUtil.small_estimator(m, number_of_zeroes)
        elif estimator <= self._large_estimator_cutoff:
            return estimator
        else:
            return HLLUtil.large_estimator(self._log2m, self._regwidth,
                                           estimator)
Example #7
def test_normal_range_smoke():
    """
    Smoke test for ``HLL.cardinality()`` and the proper use of the
    uncorrected estimator.
    """
    log2m = 11
    regwidth = 5

    # regwidth = 5, so hash space is
    # log2m + (2^5 - 1 - 1), so L = log2m + 30
    L = log2m + 30
    m = BitUtil.left_shift_int(1, log2m)
    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    # all registers at 'medium' value
    register_value = 7  # chosen to ensure neither correction kicks in
    for i in range(0, m):
        hll.add_raw(
            probabilistic_test_util.construct_hll_value(
                log2m, i, register_value))

    cardinality = hll.cardinality()

    # Simplified estimator when all registers take same value: alpha_m_squared / (m/2^val)
    estimator = HLLUtil.alpha_m_squared(m) / (m / (2**register_value))

    assert estimator <= (2**L) / 30
    assert estimator > (5 * m / 2)

    expected = ceil(estimator)
    assert cardinality == expected
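
# Rough numbers behind the assertions above (a hedged check; alpha_m is assumed to be
# the usual HyperLogLog constant 0.7213 / (1 + 1.079 / m) for m >= 128):
m, register_value, L = 1 << 11, 7, 41
alpha_m = 0.7213 / (1 + 1.079 / m)
raw_estimator = (alpha_m * m * m) / (m / 2**register_value)   # ~1.9e5
assert 5 * m / 2 < raw_estimator < (2**L) / 30                # neither correction applies
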
Example #8
def test_large_range_smoke():
    """
    Smoke test for ``HLL.cardinality()`` and the proper use of the large
    range correction.
    """
    log2m = 12
    regwidth = 5
    # regwidth = 5, so hash space is
    # log2m + (2^5 - 1 - 1), so L = log2m + 30
    L = log2m + 30
    m = BitUtil.left_shift_int(1, log2m)
    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    register_value = 31  # chosen to ensure large correction kicks in
    for i in range(0, m):
        hll.add_raw(
            probabilistic_test_util.construct_hll_value(
                log2m, i, register_value))

    cardinality = hll.cardinality()

    # Simplified estimator when all registers take same value: alpha_m_squared / (m/2^val)
    estimator = HLLUtil.alpha_m_squared(m) / (m / (2**register_value))

    # Assert conditions for large range

    assert estimator > (2**L) / 30

    # Large range correction: -2^L * log(1 - E/2^L)
    try:
        expected = ceil(-1.0 * (2**L) * log(1.0 - estimator / (2**L)))
    except ValueError:
        expected = 0
    assert cardinality == expected
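
# The same arithmetic with register_value = 31 shows why the try/except is needed
# (a hedged check; alpha_m as in the normal-range sketch above):
m, L = 1 << 12, 42
alpha_m = 0.7213 / (1 + 1.079 / m)
estimator = alpha_m * m * 2**31      # ~6.3e12, but the hash space is only 2**42 ~= 4.4e12
assert estimator > 2**L              # so 1 - estimator / 2**L < 0, log() raises ValueError,
                                     # and the test falls back to expected = 0
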
Example #9
    def _full_probabilistic_algorithm_cardinality(self):
        """
        Computes the exact cardinality value returned by the HLL algorithm when
        represented as a ``HLLType.FULL`` HLL. Kept separate from ``cardinality()`` for testing purposes.
        ``type`` must be ``HLLType.FULL``.

        :rtype: float
        """
        from python_hll.hllutil import HLLUtil
        # for performance
        m = self._m
        # compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
        # 'j'th register value
        indicator_function = 0.0
        number_of_zeroes = 0  # "V" in the paper
        iterator = self._probabilistic_storage.register_iterator()
        for register in iterator:
            indicator_function += 1.0 / BitUtil.left_shift_long(1, register)
            if register == 0:
                number_of_zeroes += 1
        # apply the estimate and correction to the indicator function
        estimator = self._alpha_m_squared / indicator_function
        if number_of_zeroes != 0 and (estimator <
                                      self._small_estimator_cutoff):
            return HLLUtil.small_estimator(m, number_of_zeroes)
        elif estimator <= self._large_estimator_cutoff:
            return estimator
        else:
            return HLLUtil.large_estimator(self._log2m, self._regwidth,
                                           estimator)
Example #10
    def _add_raw_sparse_probabilistic(self, raw_value):
        """
        Adds the raw value to the ``sparseProbabilisticStorage``.
        ``type`` must be ``HLLType.SPARSE``.

        :param long raw_value: the raw value to add to the sparse storage.
        :rtype: void
        """

        # p(w): position of the least significant set bit (one-indexed)
        # By contract: p(w) <= 2^(register_value_in_bits) - 1 (the max register value)
        #
        # By construction of pw_max_mask (see constructor),
        #      lsb(pw_max_mask) = 2^(register_value_in_bits) - 2,
        # thus lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) - 2,
        # thus 1 + lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) -1.
        sub_stream_value = BitUtil.unsigned_right_shift_long(
            raw_value, self._log2m)
        p_w = None

        if sub_stream_value == 0:
            # The paper does not cover p(0x0), so the special value 0 is used.
            # 0 is the original initialization value of the registers, so by
            # doing this the multiset simply ignores it. This is acceptable
            # because the probability is 1/(2^(2^register_size_in_bits)).
            p_w = 0
        else:
            p_w = BitUtil.to_signed_byte(1 + BitUtil.least_significant_bit(
                sub_stream_value | self._pw_max_mask))

        # Short-circuit if the register is being set to zero, since algorithmically
        # this corresponds to an "unset" register, and "unset" registers aren't
        # stored to save memory. (The very reason this sparse implementation
        # exists.) If a register is set to zero it will break the algorithm_cardinality
        # code.
        if p_w == 0:
            return

        # NOTE:  no +1 as in paper since 0-based indexing
        j = int(raw_value & self._m_bits_mask)

        current_value = self._sparse_probabilistic_storage.get(j, 0)
        if p_w > current_value:
            self._sparse_probabilistic_storage[j] = p_w
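
# A small illustration of the clamping described in the comment block above. The
# single-bit mask below is a simplified stand-in for HLLUtil.pw_max_mask (the real
# mask also sets every higher bit, which does not change the least-significant-bit
# argument):
regwidth = 5
max_register_value = 2**regwidth - 1            # 31
pw_max_mask = 1 << (max_register_value - 1)     # lowest set bit at position 30

w = 1 << 40                                     # a substream whose uncapped p(w) would be 41
lsb = ((w | pw_max_mask) & -(w | pw_max_mask)).bit_length() - 1
assert 1 + lsb == max_register_value            # clamped to 31, the largest storable value
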
Example #11
def test_clear():
    """
    Tests HLL.clear().
    """
    regwidth = 5
    log2m = 4  # 16 registers per counter
    m = BitUtil.left_shift_int(1, log2m)

    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    bit_vector = hll._probabilistic_storage
    for i in range(0, m):
        bit_vector.set_register(i, i)

    hll.clear()
    for i in range(0, m):
        assert bit_vector.get_register(i) == 0  # default value of register
Example #12
def run_ascending_test(word_length, byte_padding, word_count):
    """
    Runs a test which serializes and deserializes ascending (from zero) word values.
    """
    word_mask = ~0 if word_length == 64 else BitUtil.left_shift_long(1, word_length) - 1

    serializer = BigEndianAscendingWordSerializer(word_length, word_count, byte_padding)

    for i in range(word_count):
        serializer.write_word(i & word_mask)

    bytes_ = serializer.get_bytes()

    deserializer = BigEndianAscendingWordDeserializer(word_length, byte_padding, bytes_)

    assert deserializer.total_word_count() == word_count

    for i in range(word_count):
        assert deserializer.read_word() == (i & word_mask)
Example #13
def run_random_test(word_length, byte_padding, word_count, seed):
    """
    Runs a test which serializes and deserializes random word values.
    """
    random.seed(seed)

    word_mask = ~0 if word_length == 64 else BitUtil.left_shift_long(1, word_length) - 1

    serializer = BigEndianAscendingWordSerializer(word_length, word_count, byte_padding)

    for _ in range(word_count):
        value = random.randint(0, maxsize) & word_mask
        serializer.write_word(value)

    bytes_ = serializer.get_bytes()

    deserializer = BigEndianAscendingWordDeserializer(word_length, byte_padding, bytes_)

    assert deserializer.total_word_count() == word_count

    # verification random
    random.seed(seed)
    for _ in range(word_count):
        assert deserializer.read_word() == (random.randint(0, maxsize) & word_mask)
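
# The two helpers above are typically driven over a grid of word lengths and paddings;
# a hedged sketch of such a sweep (the exact grids used by the real test suite are not
# shown here, and the bounds below are assumptions):
def test_serializer_round_trips():
    for word_length in range(5, 65):
        for byte_padding in range(0, 4):
            run_ascending_test(word_length, byte_padding, word_count=128)
            run_random_test(word_length, byte_padding, word_count=128, seed=0x1234ABCD)
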
Example #14
class HLL:
    """
    A probabilistic set of hashed ``long`` elements. Useful for computing
    the approximate cardinality of a stream of data in very small storage.

    A modified version of the `'HyperLogLog' data structure and algorithm
    <http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf>`_ is used,
    which combines both probabilistic and non-probabilistic techniques to
    improve the accuracy and storage requirements of the original algorithm.

    More specifically, initializing and storing a new HLL will
    allocate a sentinel value symbolizing the empty set (HLLType.EMPTY).
    After adding the first few values, a sorted list of unique integers is
    stored in a HLLType.EXPLICIT hash set. When configured, accuracy can
    be sacrificed for memory footprint: the values in the sorted list are
    "promoted" to a "HLLType.SPARSE" map-based HyperLogLog structure.
    Finally, when enough registers are set, the map-based HLL will be converted
    to a bit-packed "HLLType.FULL" HyperLogLog structure.

    This data structure is interoperable with the implementations found at:

    * `postgresql-hll <https://github.com/aggregateknowledge/postgresql-hll>`_
    * `js-hll <https://github.com/aggregateknowledge/js-hll>`_

    when `properly serialized <https://github.com/aggregateknowledge/postgresql-hll/blob/master/STORAGE.markdown>`_.
    """

    # minimum and maximum values for the log-base-2 of the number of registers
    # in the HLL
    MINIMUM_LOG2M_PARAM = 4
    MAXIMUM_LOG2M_PARAM = 30

    # minimum and maximum values for the register width of the HLL
    MINIMUM_REGWIDTH_PARAM = 1
    MAXIMUM_REGWIDTH_PARAM = 8

    # minimum and maximum values for the 'expthresh' parameter of the
    # constructor that is meant to match the PostgreSQL implementation's
    # constructor and parameter names
    MINIMUM_EXPTHRESH_PARAM = -1
    MAXIMUM_EXPTHRESH_PARAM = 18
    MAXIMUM_EXPLICIT_THRESHOLD = BitUtil.left_shift_int(
        1, (MAXIMUM_EXPTHRESH_PARAM - 1))  # per storage spec

    # ------------------------------------------------------------
    # STORAGE
    # :var set _explicit_storage: storage used when ``type`` is EXPLICIT, None otherwise
    # :var dict _sparse_probabilistic_storage: storage used when ``type`` is SPARSE, None otherwise
    # :var BitVector _probabilistic_storage: storage used when ``type`` is FULL, None otherwise
    # :var HLLType type: current type of this HLL instance, if this changes then so should the storage used (see above)

    # ------------------------------------------------------------
    # CHARACTERISTIC PARAMETERS
    # NOTE:  These members are named to match the PostgreSQL implementation's parameters.
    # :var int _log2m: log2(the number of probabilistic HLL registers)
    # :var int _regwidth: the size (width) of each register in bits

    # ------------------------------------------------------------
    # COMPUTED CONSTANTS
    # ............................................................
    # EXPLICIT-specific constants
    # :var boolean _explicit_off: flag indicating if the EXPLICIT representation should NOT be used
    # :var boolean _explicit_auto: flag indicating that the promotion threshold from EXPLICIT should be
    #              computed automatically. NOTE:  this only has meaning when '_explicit_off' is false.
    # :var int _explicit_threshold: threshold (in element count) at which an EXPLICIT HLL is converted to a
    #           SPARSE or FULL HLL; always either zero or a power of two
    #           NOTE:  this only has meaning when '_explicit_off' is false
    # ............................................................
    # SPARSE-specific constants
    # :var int _short_word_length: the computed width of the short words
    # :var boolean _sparse_off: flag indicating if the SPARSE representation should not be used
    # :var int _sparse_threshold: threshold (in register count) at which a SPARSE HLL is converted to a
    #          FULL HLL, always greater than zero
    # ............................................................
    # Probabilistic algorithm constants
    # :var int _m: the number of registers, will always be a power of 2
    # :var int _m_bits_mask: a mask of the log2m bits set to one and the rest to zero
    # :var int _value_mask: a mask as wide as a register (see ``from_bytes()``)
    # :var long _long_pw_mask: mask used to ensure that p(w) does not overflow register (see ``__init__()`` and ``add_raw()``)
    # :var float _alpha_m_squared: alpha * m^2 (the constant in the "'raw' HyperLogLog estimator")
    # :var float _small_estimator_cutoff: the cutoff value of the estimator for using the "small" range cardinality correction formula
    # :var float _large_estimator_cutoff: the cutoff value of the estimator for using the "large" range cardinality correction formula

    def __init__(self,
                 log2m,
                 regwidth,
                 expthresh=-1,
                 sparseon=True,
                 type=HLLType.EMPTY):
        """
        NOTE: Arguments here are named and structured identically to those in the
              PostgreSQL implementation, which can be found
              `here <https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning>`_.

        :param log2m: log-base-2 of the number of registers used in the HyperLogLog
               algorithm. Must be at least 4 and at most 30.
        :type log2m: int
        :param regwidth: number of bits used per register in the HyperLogLog
               algorithm. Must be at least 1 and at most 8.
        :type regwidth: int
        :param expthresh: tunes when the ``HLLType.EXPLICIT`` to
               ``HLLType.SPARSE`` promotion occurs,
               based on the set's cardinality. Must be at least -1 and at most 18.
               +-----------+--------------------------------------------------------------------------------+
               | expthresh | Meaning                                                                        |
               +===========+================================================================================+
               | -1        | Promote at whatever cutoff makes sense for optimal memory usage. ('auto' mode) |
               +-----------+--------------------------------------------------------------------------------+
               | 0         | Skip ``EXPLICIT`` representation in hierarchy.                                 |
               +-----------+--------------------------------------------------------------------------------+
               | 1-18      | Promote at 2:sup:`expthresh - 1` cardinality                                   |
               +-----------+--------------------------------------------------------------------------------+
        :type expthresh: int
        :param sparseon: Flag indicating if the ``HLLType.SPARSE``
               representation should be used.
        :type sparseon: boolean
        :param type: the type in the promotion hierarchy which this instance should
               start at. This cannot be ``None``.
        :type type: HLLType
        """
        from python_hll.hllutil import HLLUtil

        self._log2m = log2m
        if log2m < HLL.MINIMUM_LOG2M_PARAM or log2m > HLL.MAXIMUM_LOG2M_PARAM:
            raise Exception("'log2m' must be at least " +
                            str(HLL.MINIMUM_LOG2M_PARAM) + " and at most " +
                            str(HLL.MAXIMUM_LOG2M_PARAM) + " (was: " +
                            str(log2m) + ")")

        self._regwidth = regwidth
        if regwidth < HLL.MINIMUM_REGWIDTH_PARAM or regwidth > HLL.MAXIMUM_REGWIDTH_PARAM:
            raise Exception("'regwidth' must be at least " +
                            str(HLL.MINIMUM_REGWIDTH_PARAM) + " and at most " +
                            str(HLL.MAXIMUM_REGWIDTH_PARAM) + " (was: " +
                            str(regwidth) + ")")

        self._m = BitUtil.left_shift_int(1, log2m)
        self._m_bits_mask = self._m - 1
        self._value_mask = BitUtil.left_shift_int(1, regwidth) - 1
        self._pw_max_mask = HLLUtil.pw_max_mask(regwidth)
        self._alpha_m_squared = HLLUtil.alpha_m_squared(self._m)
        self._small_estimator_cutoff = HLLUtil.small_estimator_cutoff(self._m)
        self._large_estimator_cutoff = HLLUtil.large_estimator_cutoff(
            log2m, regwidth)

        if expthresh == -1:
            self._explicit_auto = True
            self._explicit_off = False

            # NOTE:  This math matches the size calculation in the PostgreSQL impl.
            full_representation_size = floor((self._regwidth * self._m + 7) /
                                             8)  # round up to next whole byte
            num_longs = floor(full_representation_size /
                              8)  # integer division to round down

            if num_longs > HLL.MAXIMUM_EXPLICIT_THRESHOLD:
                self._explicit_threshold = HLL.MAXIMUM_EXPLICIT_THRESHOLD
            else:
                self._explicit_threshold = num_longs
        elif expthresh == 0:
            self._explicit_auto = False
            self._explicit_off = True
            self._explicit_threshold = 0
        elif 0 < expthresh <= HLL.MAXIMUM_EXPTHRESH_PARAM:
            self._explicit_auto = False
            self._explicit_off = False
            self._explicit_threshold = BitUtil.left_shift_int(
                1, (expthresh - 1))
        else:
            raise Exception("'expthresh' must be at least " +
                            str(HLL.MINIMUM_EXPTHRESH_PARAM) +
                            " and at most " +
                            str(HLL.MAXIMUM_EXPTHRESH_PARAM) + " (was: " +
                            str(expthresh) + ")")

        self._short_word_length = regwidth + log2m
        self._sparse_off = not sparseon
        if self._sparse_off:
            self._sparse_threshold = 0
        else:
            # TODO improve this cutoff to include the cost overhead of members/objects
            largest_pow_2_less_than_cutoff = int(
                NumberUtil.log2(
                    (self._m * self._regwidth) / self._short_word_length))
            self._sparse_threshold = BitUtil.left_shift_int(
                1, largest_pow_2_less_than_cutoff)

        self._initialize_storage(type)

    @classmethod
    def create_for_testing(cls, log2m, regwidth, explicit_threshold,
                           sparse_threshold, type):
        """
        Convenience constructor for testing. Assumes that both ``HLLType.EXPLICIT``
        and ``HLLType.SPARSE`` representations should be enabled.

        :param log2m: log-base-2 of the number of registers used in the HyperLogLog
               algorithm. Must be at least 4 and at most 30.
        :type log2m: int
        :param regwidth: number of bits used per register in the HyperLogLog
               algorithm. Must be at least 1 and at most 8.
        :type regwidth: int
        :param explicit_threshold: cardinality threshold at which the ``HLLType.EXPLICIT``
               representation should be promoted to ``HLLType.SPARSE``.
               This must be greater than zero and less than or equal to ``MAXIMUM_EXPLICIT_THRESHOLD``.
        :type explicit_threshold: int
        :param sparse_threshold: register count threshold at which the ``HLLType.SPARSE``
               representation should be promoted to ``HLLType.FULL``.
               This must be greater than zero.
        :type sparse_threshold: int
        :param type: the type in the promotion hierarchy which this instance should
               start at. This cannot be ``None``.
        :type type: HLLType
        :rtype: HLL
        """
        hll = HLL(log2m=log2m,
                  regwidth=regwidth,
                  expthresh=-1,
                  sparseon=True,
                  type=type)
        hll._explicit_auto = False
        hll._explicit_off = False
        hll._explicit_threshold = explicit_threshold
        if explicit_threshold < 1 or explicit_threshold > cls.MAXIMUM_EXPLICIT_THRESHOLD:
            raise Exception(
                "'explicit_threshold' must be at least 1 and at most " +
                str(cls.MAXIMUM_EXPLICIT_THRESHOLD) + " (was: " +
                str(explicit_threshold) + ")")
        hll._sparse_off = False
        hll._sparse_threshold = sparse_threshold
        return hll

    def get_type(self):
        """
        Returns the type in the promotion hierarchy of this instance. This will
        never be ``None``.

        :rtype: HLLType
        """
        return self._type

    def add_raw(self, raw_value):
        """
        Adds ``rawValue`` directly to the HLL.

        :param long raw_value: the value to be added. It is very important that this
               value already be hashed with a strong (but not
               necessarily cryptographic) hash function. For instance, the
               `MurmurHash3 implementation <https://pypi.org/project/mmh3/>`_
               is an excellent hash function for this purpose.
        :rtype: void
        """

        if self._type == HLLType.EMPTY:
            # Note: EMPTY type is always promoted on add_raw()
            if self._explicit_threshold > 0:
                self._initialize_storage(HLLType.EXPLICIT)
                self._explicit_storage.add(raw_value)
            elif not self._sparse_off:
                self._initialize_storage(HLLType.SPARSE)
                self._add_raw_sparse_probabilistic(raw_value)
            else:
                self._initialize_storage(HLLType.FULL)
                self._add_raw_probabilistic(raw_value)
            return

        elif self._type == HLLType.EXPLICIT:
            self._explicit_storage.add(raw_value)

            # promotion, if necessary
            if len(self._explicit_storage) > self._explicit_threshold:
                if not self._sparse_off:
                    self._initialize_storage(HLLType.SPARSE)
                    for value in self._explicit_storage:
                        self._add_raw_sparse_probabilistic(value)
                else:
                    self._initialize_storage(HLLType.FULL)
                    for value in self._explicit_storage:
                        self._add_raw_probabilistic(value)
                self._explicit_storage = None
            return

        elif self._type == HLLType.SPARSE:
            self._add_raw_sparse_probabilistic(raw_value)

            # promotion, if necessary
            if len(self._sparse_probabilistic_storage
                   ) > self._sparse_threshold:
                self._initialize_storage(HLLType.FULL)
                for register_index in self._sparse_probabilistic_storage.keys(
                ):
                    register_value = self._sparse_probabilistic_storage.get(
                        register_index, 0)
                    self._probabilistic_storage.set_max_register(
                        register_index, register_value)
                self._sparse_probabilistic_storage = None
            return

        elif self._type == HLLType.FULL:
            self._add_raw_probabilistic(raw_value)
            return

        else:
            raise Exception("Unsupported HLL type: {}".format(self._type))

    def _add_raw_sparse_probabilistic(self, raw_value):
        """
        Adds the raw value to the ``sparseProbabilisticStorage``.
        ``type`` must be ``HLLType.SPARSE``.

        :param long raw_value: the raw value to add to the sparse storage.
        :rtype: void
        """

        # p(w): position of the least significant set bit (one-indexed)
        # By contract: p(w) <= 2^(register_value_in_bits) - 1 (the max register value)
        #
        # By construction of pw_max_mask (see constructor),
        #      lsb(pw_max_mask) = 2^(register_value_in_bits) - 2,
        # thus lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) - 2,
        # thus 1 + lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) -1.
        sub_stream_value = BitUtil.unsigned_right_shift_long(
            raw_value, self._log2m)
        p_w = None

        if sub_stream_value == 0:
            # The paper does not cover p(0x0), so the special value 0 is used.
            # 0 is the original initialization value of the registers, so by
            # doing this the multiset simply ignores it. This is acceptable
            # because the probability is 1/(2^(2^register_size_in_bits)).
            p_w = 0
        else:
            p_w = BitUtil.to_signed_byte(1 + BitUtil.least_significant_bit(
                sub_stream_value | self._pw_max_mask))

        # Short-circuit if the register is being set to zero, since algorithmically
        # this corresponds to an "unset" register, and "unset" registers aren't
        # stored to save memory. (The very reason this sparse implementation
        # exists.) If a register is set to zero it will break the algorithm_cardinality
        # code.
        if p_w == 0:
            return

        # NOTE:  no +1 as in paper since 0-based indexing
        j = int(raw_value & self._m_bits_mask)

        current_value = self._sparse_probabilistic_storage.get(j, 0)
        if p_w > current_value:
            self._sparse_probabilistic_storage[j] = p_w

    def _add_raw_probabilistic(self, raw_value):
        """
        Adds the raw value to the ``probabilisticStorage``.
        ``type`` must be ``HLLType.FULL``.

        :param long raw_value: the raw value to add to the full probabilistic storage.
        :rtype: void
        """
        # p(w): position of the least significant set bit (one-indexed)
        # By contract: p(w) <= 2^(register_value_in_bits) - 1 (the max register value)
        #
        # By construction of pw_max_mask (see constructor),
        #      lsb(pw_max_mask) = 2^(register_value_in_bits) - 2,
        # thus lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) - 2,
        # thus 1 + lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) -1.
        sub_stream_value = BitUtil.unsigned_right_shift_long(
            raw_value, self._log2m)
        p_w = None

        if sub_stream_value == 0:
            # The paper does not cover p(0x0), so the special value 0 is used.
            # 0 is the original initialization value of the registers, so by
            # doing this the multiset simply ignores it. This is acceptable
            # because the probability is 1/(2^(2^register_size_in_bits)).
            p_w = 0
        else:
            p_w = BitUtil.to_signed_byte(1 + BitUtil.least_significant_bit(
                sub_stream_value | self._pw_max_mask))

        # Short-circuit if the register is being set to zero, since algorithmically
        # this corresponds to an "unset" register, and "unset" registers aren't
        # stored to save memory. (The very reason this sparse implementation
        # exists.) If a register is set to zero it will break the algorithm_cardinality
        # code.
        if p_w == 0:
            return

        # NOTE:  no +1 as in paper since 0-based indexing
        j = int(raw_value & self._m_bits_mask)

        self._probabilistic_storage.set_max_register(j, p_w)

    def _initialize_storage(self, type):
        """
        Initializes storage for the specified ``HLLType`` and changes the
        instance's ``type``.

        :param HLLType type: the ``HLLType`` to initialize storage for. This cannot be
               ``None`` and must be an instantiable type. (For instance,
               it cannot be ``HLLType.UNDEFINED``.)
        :rtype: void
        """
        self._type = type
        if type == HLLType.EMPTY:
            # nothing to be done
            pass
        elif type == HLLType.EXPLICIT:
            self._explicit_storage = set()
        elif type == HLLType.SPARSE:
            self._sparse_probabilistic_storage = dict()
        elif type == HLLType.FULL:
            self._probabilistic_storage = BitVector(self._regwidth, self._m)
        else:
            raise Exception("Unsupported HLL type: {}".format(self._type))

    def cardinality(self):
        """
        Computes the cardinality of the HLL.

        :returns: the cardinality of HLL. This will never be negative.
        :rtype: long
        """
        if self._type == HLLType.EMPTY:
            return 0  # by definition
        elif self._type == HLLType.EXPLICIT:
            return len(self._explicit_storage)
        elif self._type == HLLType.SPARSE:
            return ceil(self._sparse_probabilistic_algorithm_cardinality())
        elif self._type == HLLType.FULL:
            return ceil(self._full_probabilistic_algorithm_cardinality())
        else:
            raise Exception("Unsupported HLL type: {}".format(self._type))

    def _sparse_probabilistic_algorithm_cardinality(self):
        """
        Computes the exact cardinality value returned by the HLL algorithm when
        represented as a ``HLLType.SPARSE`` HLL. Kept
        separate from ``cardinality()`` for testing purposes. ``type``
        must be ``HLLType.SPARSE``.

        :returns: the exact, unrounded cardinality given by the HLL algorithm
        :rtype: float
        """
        from python_hll.hllutil import HLLUtil
        m = self._m

        # compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
        # 'j'th register value
        indicator_function = 0.0
        number_of_zeroes = 0  # "V" in the paper
        for j in range(m):
            register = self._sparse_probabilistic_storage.get(j, 0)

            indicator_function += 1.0 / BitUtil.left_shift_long(1, register)
            if register == 0:
                number_of_zeroes += 1

        # apply the estimate and correction to the indicator function
        estimator = self._alpha_m_squared / indicator_function
        if number_of_zeroes != 0 and estimator < self._small_estimator_cutoff:
            return HLLUtil.small_estimator(m, number_of_zeroes)
        elif estimator <= self._large_estimator_cutoff:
            return estimator
        else:
            return HLLUtil.large_estimator(self._log2m, self._regwidth,
                                           estimator)

    def _full_probabilistic_algorithm_cardinality(self):
        """
        Computes the exact cardinality value returned by the HLL algorithm when
        represented as a ``HLLType.FULL`` HLL. Kept separate from ``cardinality()`` for testing purposes.
        ``type`` must be ``HLLType.FULL``.

        :rtype: float
        """
        from python_hll.hllutil import HLLUtil
        # for performance
        m = self._m
        # compute the "indicator function" -- sum(2^(-M[j])) where M[j] is the
        # 'j'th register value
        indicator_function = 0.0
        number_of_zeroes = 0  # "V" in the paper
        iterator = self._probabilistic_storage.register_iterator()
        for register in iterator:
            indicator_function += 1.0 / BitUtil.left_shift_long(1, register)
            if register == 0:
                number_of_zeroes += 1
        # apply the estimate and correction to the indicator function
        estimator = self._alpha_m_squared / indicator_function
        if number_of_zeroes != 0 and (estimator <
                                      self._small_estimator_cutoff):
            return HLLUtil.small_estimator(m, number_of_zeroes)
        elif estimator <= self._large_estimator_cutoff:
            return estimator
        else:
            return HLLUtil.large_estimator(self._log2m, self._regwidth,
                                           estimator)

    def clear(self):
        """
        Clears the HLL. The HLL will have cardinality zero and will act as if no
        elements have been added.

        NOTE: Unlike ``addRaw(long)``, ``clear`` does NOT handle
        transitions between ``HLLType``'s - a probabilistic type will remain
        probabilistic after being cleared.

        :rtype: void
        """
        if self._type == HLLType.EMPTY:
            return  # do nothing
        elif self._type == HLLType.EXPLICIT:
            return self._explicit_storage.clear()
        elif self._type == HLLType.SPARSE:
            return self._sparse_probabilistic_storage.clear()
        elif self._type == HLLType.FULL:
            self._probabilistic_storage.fill(0)
            return
        else:
            raise Exception('Unsupported HLL type: {}'.format(self._type))

    def union(self, other):
        """
        Computes the union of HLLs and stores the result in this instance.

        :param HLL other: the other ``HLL`` instance to union into this one. This
               cannot be ``None``.
        :rtype: void
        """
        # TODO: verify HLL compatibility
        other_type = other.get_type()

        if self._type == other_type:
            self._homogeneous_union(other)
        else:
            self._heterogenous_union(other)

    def _heterogeneous_union_for_empty_hll(self, other):
        # The union of empty with non-empty HLL is just a clone of the non-empty.

        if other.get_type() == HLLType.EXPLICIT:
            # src: EXPLICIT
            # dest: EMPTY

            if len(other._explicit_storage) <= self._explicit_threshold:
                self._type = HLLType.EXPLICIT
                self._explicit_storage = deepcopy(other._explicit_storage)
            else:
                if not self._sparse_off:
                    self._initialize_storage(HLLType.SPARSE)
                else:
                    self._initialize_storage(HLLType.FULL)

                for value in other._explicit_storage:
                    self.add_raw(value)

        elif other.get_type() == HLLType.SPARSE:
            # src: SPARSE
            # dest: EMPTY

            if not self._sparse_off:
                self._type = HLLType.SPARSE
                self._sparse_probabilistic_storage = deepcopy(
                    other._sparse_probabilistic_storage)
            else:
                self._initialize_storage(HLLType.FULL)
                for register_index in other._sparse_probabilistic_storage.keys(
                ):
                    register_value = other._sparse_probabilistic_storage.get(
                        register_index)
                    self._probabilistic_storage.set_max_register(
                        register_index, register_value)
            return

        else:  # case FULL
            # src: FULL
            # dest: EMPTY
            self._type = HLLType.FULL
            self._probabilistic_storage = deepcopy(
                other._probabilistic_storage)
            return

    def _heterogeneous_union_for_non_empty_hll(self, other):
        if self._type == HLLType.EXPLICIT:
            # src:  FULL/SPARSE
            # dest: EXPLICIT
            # "Storing into destination" cannot be done (since destination
            # is by definition of smaller capacity than source), so a clone
            # of source is made and values from destination are inserted
            # into that.

            # Determine source and destination storage.
            # NOTE:  destination storage may change through promotion if
            #        source is SPARSE.

            if other.get_type() == HLLType.SPARSE:
                if not self._sparse_off:
                    self._type = HLLType.SPARSE
                    self._sparse_probabilistic_storage = deepcopy(
                        other._sparse_probabilistic_storage)
                else:
                    self._initialize_storage(HLLType.FULL)
                    for register_index in other._sparse_probabilistic_storage.keys(
                    ):
                        register_value = other._sparse_probabilistic_storage.get(
                            register_index)
                        self._probabilistic_storage.set_max_register(
                            register_index, register_value)

            else:  # source is HLLType.FULL
                self._type = HLLType.FULL
                self._probabilistic_storage = deepcopy(
                    other._probabilistic_storage)

            for value in self._explicit_storage:
                self.add_raw(value)
            self._explicit_storage = None
            return

        elif self._type == HLLType.SPARSE:
            if other.get_type() == HLLType.EXPLICIT:
                # src: EXPLICIT
                # dest: SPARSE
                # Add the raw values from the source to the destination.

                for value in other._explicit_storage:
                    # NOTE: add_raw will handle promotion cleanup
                    self.add_raw(value)

            else:  # source is HLLType.FULL
                # src:  FULL
                # dest: SPARSE
                # "Storing into destination" cannot be done (since destination
                # is by definition of smaller capacity than source), so a
                # clone of source is made and registers from the destination
                # are merged into the clone.

                self._type = HLLType.FULL
                self._probabilistic_storage = deepcopy(
                    other._probabilistic_storage)
                for register_index in self._sparse_probabilistic_storage.keys(
                ):
                    register_value = self._sparse_probabilistic_storage.get(
                        register_index, 0)
                    self._probabilistic_storage.set_max_register(
                        register_index, register_value)
                self._sparse_probabilistic_storage = None

        else:  # destination is HLLType.FULL
            if other._type == HLLType.EXPLICIT:
                # src: EXPLICIT
                # dest: FULL
                # Add the raw values from the source to the destination.
                # Promotion is not possible, so don't bother checking.

                for value in other._explicit_storage:
                    self.add_raw(value)

            else:  # source is HLLType.SPARSE
                # src: SPARSE
                # dest: FULL
                # Merge the registers from the source into the destination.
                # Promotion is not possible, so don't bother checking.

                for register_index in other._sparse_probabilistic_storage.keys(
                ):
                    register_value = other._sparse_probabilistic_storage.get(
                        register_index)
                    self._probabilistic_storage.set_max_register(
                        register_index, register_value)

    def _heterogenous_union(self, other):
        """
        The logic here is divided into two sections: unions with an EMPTY
        HLL, and unions between EXPLICIT/SPARSE/FULL HLL.

        Between those two sections, all possible heterogeneous unions are
        covered. Should another type be added to HLLType whose unions
        are not easily reduced (say, as EMPTY's are below) this may be more
        easily implemented as Strategies. However, that is unnecessary as it
        stands.
        :type other: HLL
        :rtype: void
        """

        # Union with an EMPTY
        if self._type == HLLType.EMPTY:
            self._heterogeneous_union_for_empty_hll(other)
            return
        elif other.get_type() == HLLType.EMPTY:
            # source is empty, so just return destination since it is unchanged
            return

        # else -- both of the sets are not empty
        self._heterogeneous_union_for_non_empty_hll(other)

    def _homogeneous_union(self, other):
        """
        Computes the union of two HLLs of the same type, and stores the
        result in this instance.

        :param HLL other: the other ``HLL`` instance to union into this one. This
               cannot be ``None``.
        :rtype: void
        """
        if self._type == HLLType.EMPTY:
            # union of empty and empty is empty
            return

        elif self._type == HLLType.EXPLICIT:
            for value in other._explicit_storage:
                # Note: add_raw() will handle promotion, if necessary
                self.add_raw(value)

        elif self._type == HLLType.SPARSE:

            for register_index in other._sparse_probabilistic_storage.keys():
                register_value = other._sparse_probabilistic_storage.get(
                    register_index)
                current_register_value = self._sparse_probabilistic_storage.get(
                    register_index, 0)
                if register_value > current_register_value:
                    self._sparse_probabilistic_storage[
                        register_index] = register_value

            # promotion, if necessary
            if len(self._sparse_probabilistic_storage
                   ) > self._sparse_threshold:
                self._initialize_storage(HLLType.FULL)
                for register_index in self._sparse_probabilistic_storage.keys(
                ):
                    register_value = self._sparse_probabilistic_storage.get(
                        register_index, 0)
                    self._probabilistic_storage.set_max_register(
                        register_index, register_value)

                self._sparse_probabilistic_storage = None

        elif self._type == HLLType.FULL:
            for i in range(self._m):
                register_value = other._probabilistic_storage.get_register(i)
                self._probabilistic_storage.set_max_register(i, register_value)
            return

        else:
            raise Exception('Unsupported HLL type: {}'.format(self._type))

    def to_bytes(self,
                 schema_version=SerializationUtil.DEFAULT_SCHEMA_VERSION):
        """
        Serializes the HLL to an array of bytes in correspondence with the format
        of the default schema version, ``SerializationUtil.DEFAULT_SCHEMA_VERSION``.

        :param SchemaVersion schema_version: the schema version dictating the serialization format
        :returns: the array of bytes representing the HLL. This will never be
                  ``None`` or empty.
        :rtype: list
        """
        from python_hll.hllutil import HLLUtil
        if self._type == HLLType.EMPTY:
            byte_array_length = schema_version.padding_bytes(self._type)
            byte_array = [0] * byte_array_length

        elif self._type == HLLType.EXPLICIT:
            serializer = schema_version.get_serializer(
                self._type, HLLUtil.LONG_BIT_LENGTH,
                len(self._explicit_storage))

            values = list(self._explicit_storage)
            values = sorted(values)
            for value in values:
                serializer.write_word(value)

            byte_array = serializer.get_bytes()

        elif self._type == HLLType.SPARSE:
            serializer = schema_version.get_serializer(
                self._type, self._short_word_length,
                len(self._sparse_probabilistic_storage))

            indices = self._sparse_probabilistic_storage.keys()
            indices = sorted(indices)

            for register_index in indices:
                register_value = self._sparse_probabilistic_storage.get(
                    register_index, 0)

                # pack index and value into "short word"
                short_word = BitUtil.left_shift_int(
                    register_index, self._regwidth) | register_value
                serializer.write_word(short_word)

            byte_array = serializer.get_bytes()

        elif self._type == HLLType.FULL:
            serializer = schema_version.get_serializer(self._type,
                                                       self._regwidth, self._m)
            self._probabilistic_storage.get_register_contents(serializer)

            byte_array = serializer.get_bytes()

        else:
            raise Exception('Unsupported HLL type: {}'.format(self._type))

        # only meaningful when neither _explicit_auto nor _explicit_off is set
        log2_explicit_threshold = 0
        if not (self._explicit_auto or self._explicit_off):
            log2_explicit_threshold = int(
                NumberUtil.log2(self._explicit_threshold))

        metadata = HLLMetadata(schema_version.schema_version_number(),
                               self._type, self._log2m, self._regwidth,
                               log2_explicit_threshold, self._explicit_off,
                               self._explicit_auto, not self._sparse_off)
        schema_version.write_metadata(byte_array, metadata)

        return byte_array

    @classmethod
    def from_bytes(cls, bytes):
        """
        Deserializes the HLL (in ``toBytes()`` format) serialized
        into ``bytes``.

        :param list bytes: the serialized bytes of new HLL
        :returns: the deserialized HLL. This will never be ``None``.
        :rtype: HLL
        """
        from python_hll.hllutil import HLLUtil
        schema_version = SerializationUtil.get_schema_version(bytes)
        metadata = schema_version.read_metadata(bytes)

        type = metadata.hll_type()
        reg_width = metadata.register_width()
        log_2m = metadata.register_count_log2()
        sparseon = metadata.sparse_enabled()

        expthresh = 0
        if metadata.explicit_auto():
            expthresh = -1
        elif metadata.explicit_off():
            expthresh = 0
        else:
            # NOTE: take into account that the postgres-compatible constructor
            # subtracts one before taking a power of two.
            expthresh = metadata.log2_explicit_cutoff() + 1

        hll = HLL(log_2m, reg_width, expthresh, sparseon, type)

        # Short-circuit on empty, which needs no other deserialization.
        if type == HLLType.EMPTY:
            return hll

        word_length = 0
        if type == HLLType.EXPLICIT:
            word_length = HLLUtil.LONG_BIT_LENGTH  # 64 for both java and python

        elif type == HLLType.SPARSE:
            word_length = hll._short_word_length

        elif type == HLLType.FULL:
            word_length = hll._regwidth

        else:
            raise Exception('Unsupported HLL type: {}'.format(type))

        deserializer = schema_version.get_deserializer(type, word_length,
                                                       bytes)
        if type == HLLType.EXPLICIT:
            # NOTE:  This should not exceed expthresh and this will always
            #        be exactly the number of words that were encoded,
            #        because the word length is at least a byte wide.
            # SEE:   BigEndianAscendingWordDeserializer.total_word_count()
            for i in range(deserializer.total_word_count()):
                hll._explicit_storage.add(deserializer.read_word())

        elif type == HLLType.SPARSE:
            # NOTE:  If the short_word_length were smaller than 8 bits
            #        (1 byte) there would be a possibility (because of
            #        padding arithmetic) of having one or more extra
            #        registers read. However, this is not relevant as the
            #        extra registers will be all zeroes, which are ignored
            #        in the sparse representation.
            for i in range(deserializer.total_word_count()):
                short_word = deserializer.read_word()

                register_value = BitUtil.to_signed_byte(short_word
                                                        & hll._value_mask)
                # Only set non-zero registers.
                if register_value != 0:
                    register_key = int(
                        BitUtil.unsigned_right_shift_long(
                            short_word, hll._regwidth))
                    hll._sparse_probabilistic_storage[
                        register_key] = register_value

        elif type == HLLType.FULL:
            # NOTE:  Iteration is done using m (register count) and NOT
            #        deserializer.total_word_count() because regwidth may be
            #        less than 8 and as such the padding on the 'last' byte
            #        may be larger than regwidth, causing an extra register
            #        to be read.
            # SEE: BigEndianAscendingWordDeserializer.total_word_count()
            for i in range(hll._m):
                hll._probabilistic_storage.set_register(
                    i, deserializer.read_word())

        else:
            raise Exception('Unsupported HLL type: {}'.format(type))

        return hll
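
# Putting the class together: a minimal end-to-end sketch. The mmh3 hashing step is an
# assumption based on the docstring's MurmurHash3 recommendation, and the parameter
# choices are arbitrary.
import mmh3

hll = HLL(log2m=13, regwidth=5)        # starts EMPTY and promotes as values arrive
for item in ("alpha", "beta", "gamma", "alpha"):
    hll.add_raw(mmh3.hash64(item, signed=True)[0])
print(hll.cardinality())               # ~3; duplicates are not double-counted

other = HLL(log2m=13, regwidth=5)
other.add_raw(mmh3.hash64("delta", signed=True)[0])
hll.union(other)                       # in-place union; cardinality is now ~4

round_tripped = HLL.from_bytes(hll.to_bytes())
assert round_tripped.cardinality() == hll.cardinality()
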
Example #15

# Precomputed ``twoToL`` values indexed by a linear combination of
# ``regwidth`` and ``log2m``.
#
# The array is one-dimensional and can be accessed by using index
# ``(REG_WIDTH_INDEX_MULTIPLIER * regwidth) + log2m``
# for ``regwidth`` and ``log2m`` between the specified
# ``HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM`` constants.
#
# See ``large_estimator()``.
# See ``large_estimator_cutoff()``.
# See `Blog post with section on 2^L
# <http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/>`_
TWO_TO_L = [0.0] * (HLL.MAXIMUM_REGWIDTH_PARAM +
                    1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)
for reg_width in range(HLL.MINIMUM_REGWIDTH_PARAM,
                       HLL.MAXIMUM_REGWIDTH_PARAM + 1):
    for log2m in range(HLL.MINIMUM_LOG2M_PARAM, HLL.MAXIMUM_LOG2M_PARAM + 1):
        max_register_value = BitUtil.left_shift_int(1, reg_width) - 1

        # Since 1 is added to p(w) in the insertion algorithm, only
        # (maxRegisterValue - 1) bits are inspected hence the hash
        # space is one power of two smaller.
        pw_bits = max_register_value - 1
        total_bits = pw_bits + log2m
        two_to_l = 2**total_bits
        TWO_TO_L[(HLLUtil.REG_WIDTH_INDEX_MULTIPLIER * reg_width) +
                 log2m] = two_to_l
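
# Reading the table back out, e.g. for the regwidth = 5, log2m = 11 configuration used in
# the normal-range smoke test: pw_bits = (2**5 - 1) - 1 = 30, so L = 41.
assert TWO_TO_L[(HLLUtil.REG_WIDTH_INDEX_MULTIPLIER * 5) + 11] == 2**41
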
Example #16
def test_unsigned_right_shift_byte2():
    assert BitUtil.unsigned_right_shift_byte(-1, 0) == -1
Example #17
def test_unsigned_right_shift_long2():
    assert BitUtil.unsigned_right_shift_long(-1, 0) == -1
Example #18
def test_unsigned_right_shift_int2():
    assert BitUtil.unsigned_right_shift_int(-1, 0) == -1
Example #19
def test_left_shift_byte():
    assert BitUtil.left_shift_byte(128, 3) == -1024
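
# 128 does not fit in a signed byte, so it presumably wraps to -128 first (mirroring
# Java's byte semantics); -128 << 3 == -1024 after promotion to int.
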
Example #20
def test_left_shift_int():
    assert BitUtil.left_shift_int(128, 3) == 1024
Example #21
def test_left_shift_long_3():
    assert BitUtil.left_shift_long(128, 3) == 1024
Example #22
def test_left_shift_long_2():
    assert BitUtil.left_shift_long(214748364, 8) == 54975581184
Example #23
def test_left_shift_long_1():
    assert BitUtil.left_shift_long(72057594037927935, 8) == -256
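
# 72057594037927935 == 2**56 - 1; shifting it left by 8 inside a signed 64-bit long
# overflows to 0xFFFFFFFFFFFFFF00, i.e. -256 in two's complement.
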
Example #24
    @classmethod
    def from_bytes(cls, bytes):
        """
        Deserializes the HLL (in ``toBytes()`` format) serialized
        into ``bytes``.

        :param list bytes: the serialized bytes of new HLL
        :returns: the deserialized HLL. This will never be ``None``.
        :rtype: HLL
        """
        from python_hll.hllutil import HLLUtil
        schema_version = SerializationUtil.get_schema_version(bytes)
        metadata = schema_version.read_metadata(bytes)

        type = metadata.hll_type()
        reg_width = metadata.register_width()
        log_2m = metadata.register_count_log2()
        sparseon = metadata.sparse_enabled()

        expthresh = 0
        if metadata.explicit_auto():
            expthresh = -1
        elif metadata.explicit_off():
            expthresh = 0
        else:
            # NOTE: take into account that the postgres-compatible constructor
            # subtracts one before taking a power of two.
            expthresh = metadata.log2_explicit_cutoff() + 1
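            # e.g. a serialized log2_explicit_cutoff of 4 becomes expthresh = 5
            # here, and the constructor re-derives the cutoff as 2**(5 - 1) == 16.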

        hll = HLL(log_2m, reg_width, expthresh, sparseon, type)

        # Short-circuit on empty, which needs no other deserialization.
        if type == HLLType.EMPTY:
            return hll

        word_length = 0
        if type == HLLType.EXPLICIT:
            word_length = HLLUtil.LONG_BIT_LENGTH  # 64 for both java and python

        elif type == HLLType.SPARSE:
            word_length = hll._short_word_length

        elif type == HLLType.FULL:
            word_length = hll._regwidth

        else:
            raise Exception('Unsupported HLL type: {}'.format(type))

        deserializer = schema_version.get_deserializer(type, word_length,
                                                       bytes)
        if type == HLLType.EXPLICIT:
            # NOTE:  This should not exceed expthresh and this will always
            #        be exactly the number of words that were encoded,
            #        because the word length is at least a byte wide.
            # SEE:   BigEndianAscendingWordDeserializer.total_word_count()
            for i in range(deserializer.total_word_count()):
                hll._explicit_storage.add(deserializer.read_word())

        elif type == HLLType.SPARSE:
            # NOTE:  If the short_word_length were smaller than 8 bits
            #        (1 byte) there would be a possibility (because of
            #        padding arithmetic) of having one or more extra
            #        registers read. However, this is not relevant as the
            #        extra registers will be all zeroes, which are ignored
            #        in the sparse representation.
            for i in range(deserializer.total_word_count()):
                short_word = deserializer.read_word()

                register_value = BitUtil.to_signed_byte(short_word
                                                        & hll._value_mask)
                # Only set non-zero registers.
                if register_value != 0:
                    register_key = int(
                        BitUtil.unsigned_right_shift_long(
                            short_word, hll._regwidth))
                    hll._sparse_probabilistic_storage[
                        register_key] = register_value

        elif type == HLLType.FULL:
            # NOTE:  Iteration is done using m (register count) and NOT
            #        deserializer.total_word_count() because regwidth may be
            #        less than 8 and as such the padding on the 'last' byte
            #        may be larger than regwidth, causing an extra register
            #        to be read.
            # SEE: BigEndianAscendingWordDeserializer.total_word_count()
            for i in range(hll._m):
                hll._probabilistic_storage.set_register(
                    i, deserializer.read_word())

        else:
            raise Exception('Unsupported HLL type: {}'.format(type))

        return hll
Example #25
def test_unsigned_right_shift_long():
    # -100 is reinterpreted as the unsigned 64-bit value 2**64 - 100, then
    # logically shifted right: (2**64 - 100) >> 1 == 9223372036854775758.
    assert BitUtil.unsigned_right_shift_long(-100, 1) == 9223372036854775758
Example #26
    def to_bytes(self,
                 schema_version=SerializationUtil.DEFAULT_SCHEMA_VERSION):
        """
        Serializes the HLL to an array of bytes according to the format dictated by
        ``schema_version``, which defaults to ``SerializationUtil.DEFAULT_SCHEMA_VERSION``.

        :param SchemaVersion schema_version: the schema version dictating the serialization format
        :returns: the array of bytes representing the HLL. This will never be
                  ``None`` or empty.
        :rtype: list
        """
        from python_hll.hllutil import HLLUtil
        if self._type == HLLType.EMPTY:
            byte_array_length = schema_version.padding_bytes(self._type)
            byte_array = [0] * byte_array_length

        elif self._type == HLLType.EXPLICIT:
            serializer = schema_version.get_serializer(
                self._type, HLLUtil.LONG_BIT_LENGTH,
                len(self._explicit_storage))

            values = list(self._explicit_storage)
            values = sorted(values)
            for value in values:
                serializer.write_word(value)

            byte_array = serializer.get_bytes()

        elif self._type == HLLType.SPARSE:
            serializer = schema_version.get_serializer(
                self._type, self._short_word_length,
                len(self._sparse_probabilistic_storage))

            indices = self._sparse_probabilistic_storage.keys()
            indices = sorted(indices)

            for register_index in indices:
                register_value = self._sparse_probabilistic_storage.get(
                    register_index, 0)

                # pack index and value into "short word"
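                # e.g. regwidth=5, index=3, value=9: (3 << 5) | 9 == 105; on
                # read this is undone as 105 & 0x1F == 9 and 105 >> 5 == 3.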
                short_word = BitUtil.left_shift_int(
                    register_index, self._regwidth) | register_value
                serializer.write_word(short_word)

            byte_array = serializer.get_bytes()

        elif self._type == HLLType.FULL:
            serializer = schema_version.get_serializer(self._type,
                                                       self._regwidth, self._m)
            self._probabilistic_storage.get_register_contents(serializer)

            byte_array = serializer.get_bytes()

        else:
            raise Exception('Unsupported HLL type: {}'.format(self._type))

        # log2_explicit_threshold is only meaningful when explicit storage is
        # neither 'auto' nor 'off'; otherwise it is serialized as zero.
        log2_explicit_threshold = 0
        if not (self._explicit_auto or self._explicit_off):
            log2_explicit_threshold = int(
                NumberUtil.log2(self._explicit_threshold))

        metadata = HLLMetadata(schema_version.schema_version_number(),
                               self._type, self._log2m, self._regwidth,
                               log2_explicit_threshold, self._explicit_off,
                               self._explicit_auto, not self._sparse_off)
        schema_version.write_metadata(byte_array, metadata)

        return byte_array
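
A minimal usage sketch (not from the library's documentation): it assumes the list
returned by ``to_bytes()`` holds Java-style signed byte values in the range
-128..127, which is suggested by the ``BitUtil.to_signed_byte`` calls elsewhere in
these examples.

serialized = hll.to_bytes()  # list of Java-style signed ints (assumption above)

# Masking with 0xFF maps each signed value onto 0..255 so the result can be
# turned into a real Python bytes object, e.g. for hex-encoded storage.
blob = bytes((b & 0xFF) for b in serialized)
hex_form = blob.hex()

To load such a blob again, the masking would have to be undone (for example with
``BitUtil.to_signed_byte``) before handing the list back to ``HLL.from_bytes()``;
that reverse step is an assumption here, not something shown in these examples.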
Example #27
def test_to_signed_byte():
    for unsigned_int, signed_int in UNSIGNED_TO_SIGNED_INTEGERS.items():
        assert signed_int == BitUtil.to_signed_byte(unsigned_int)
Example #28
    def __init__(self,
                 log2m,
                 regwidth,
                 expthresh=-1,
                 sparseon=True,
                 type=HLLType.EMPTY):
        """
        NOTE: Arguments here are named and structured identically to those in the
              PostgreSQL implementation, which can be found
              `here <https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning>`_.

        :param log2m: log-base-2 of the number of registers used in the HyperLogLog
               algorithm. Must be at least 4 and at most 30.
        :type log2m: int
        :param regwidth: number of bits used per register in the HyperLogLog
               algorithm. Must be at least 1 and at most 8.
        :type regwidth: int
        :param expthresh: tunes when the ``HLLType.EXPLICIT`` to
               ``HLLType.SPARSE`` promotion occurs,
               based on the set's cardinality. Must be at least -1 and at most 18.
               +-----------+--------------------------------------------------------------------------------+
               | expthresh | Meaning                                                                        |
               +===========+================================================================================+
               | -1        | Promote at whatever cutoff makes sense for optimal memory usage. ('auto' mode) |
               +-----------+--------------------------------------------------------------------------------+
               | 0         | Skip ``EXPLICIT`` representation in hierarchy.                                 |
               +-----------+--------------------------------------------------------------------------------+
               | 1-18      | Promote at 2\ :sup:`expthresh - 1` cardinality                                 |
               +-----------+--------------------------------------------------------------------------------+
        :type expthresh: int
        :param sparseon: Flag indicating if the ``HLLType.SPARSE``
               representation should be used.
        :type sparseon: boolean
        :param type: the type in the promotion hierarchy which this instance should
               start at. This cannot be ``None``.
        :type type: HLLType
        """
        from python_hll.hllutil import HLLUtil

        self._log2m = log2m
        if log2m < HLL.MINIMUM_LOG2M_PARAM or log2m > HLL.MAXIMUM_LOG2M_PARAM:
            raise Exception("'log2m' must be at least " +
                            str(HLL.MINIMUM_LOG2M_PARAM) + " and at most " +
                            str(HLL.MAXIMUM_LOG2M_PARAM) + " (was: " +
                            str(log2m) + ")")

        self._regwidth = regwidth
        if regwidth < HLL.MINIMUM_REGWIDTH_PARAM or regwidth > HLL.MAXIMUM_REGWIDTH_PARAM:
            raise Exception("'regwidth' must be at least " +
                            str(HLL.MINIMUM_REGWIDTH_PARAM) + " and at most " +
                            str(HLL.MAXIMUM_REGWIDTH_PARAM) + " (was: " +
                            str(regwidth) + ")")

        self._m = BitUtil.left_shift_int(1, log2m)
        self._m_bits_mask = self._m - 1
        self._value_mask = BitUtil.left_shift_int(1, regwidth) - 1
        self._pw_max_mask = HLLUtil.pw_max_mask(regwidth)
        self._alpha_m_squared = HLLUtil.alpha_m_squared(self._m)
        self._small_estimator_cutoff = HLLUtil.small_estimator_cutoff(self._m)
        self._large_estimator_cutoff = HLLUtil.large_estimator_cutoff(
            log2m, regwidth)

        if expthresh == -1:
            self._explicit_auto = True
            self._explicit_off = False

            # NOTE:  This math matches the size calculation in the PostgreSQL impl.
            full_representation_size = floor((self._regwidth * self._m + 7) /
                                             8)  # round up to next whole byte
            num_longs = floor(full_representation_size /
                              8)  # integer division to round down

            if num_longs > HLL.MAXIMUM_EXPLICIT_THRESHOLD:
                self._explicit_threshold = HLL.MAXIMUM_EXPLICIT_THRESHOLD
            else:
                self._explicit_threshold = num_longs
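            # e.g. log2m=11, regwidth=5: (5 * 2048 + 7) // 8 == 1280 bytes,
            # 1280 // 8 == 160 longs, so the cutoff becomes 160 (assuming
            # 160 is below HLL.MAXIMUM_EXPLICIT_THRESHOLD).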
        elif expthresh == 0:
            self._explicit_auto = False
            self._explicit_off = True
            self._explicit_threshold = 0
        elif 0 < expthresh <= HLL.MAXIMUM_EXPTHRESH_PARAM:
            self._explicit_auto = False
            self._explicit_off = False
            self._explicit_threshold = BitUtil.left_shift_int(
                1, (expthresh - 1))
        else:
            raise Exception("'expthresh' must be at least " +
                            str(HLL.MINIMUM_EXPTHRESH_PARAM) +
                            " and at most " +
                            str(HLL.MAXIMUM_EXPTHRESH_PARAM) + " (was: " +
                            str(expthresh) + ")")

        self._short_word_length = regwidth + log2m
        self._sparse_off = not sparseon
        if self._sparse_off:
            self._sparse_threshold = 0
        else:
            # TODO improve this cutoff to include the cost overhead of members/objects
            largest_pow_2_less_than_cutoff = int(
                NumberUtil.log2(
                    (self._m * self._regwidth) / self._short_word_length))
            self._sparse_threshold = BitUtil.left_shift_int(
                1, largest_pow_2_less_than_cutoff)
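            # e.g. log2m=11, regwidth=5: short words are 16 bits wide, so
            # (2048 * 5) / 16 == 640, log2(640) truncates to 9 and
            # self._sparse_threshold becomes 2**9 == 512.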

        self._initialize_storage(type)
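
For orientation, a small hedged usage sketch of this constructor (not taken from
the library's documentation); the literals passed to ``add_raw()`` stand in for
64-bit hashes of the caller's items, which in real use would come from a hash
function such as MurmurHash3.

hll = HLL(log2m=11, regwidth=5)  # defaults: expthresh=-1 ('auto'), sparseon=True

for hashed_value in (0x1A2B3C4D5E6F7081, 0x0F0E0D0C0B0A0908, 0x1122334455667788):
    hll.add_raw(hashed_value)  # raw, already-hashed 64-bit values

print(hll.cardinality())  # approximate number of distinct values added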
Example #29
def test_unsigned_right_shift_byte():
    # Java-style promotion: the byte -100 widens to the 32-bit int -100, and a
    # logical shift gives (2**32 - 100) >> 1 == 2147483598.
    assert BitUtil.unsigned_right_shift_byte(-100, 1) == 2147483598