Ejemplo n.º 1
0
    def _full_probabilistic_algorithm_cardinality(self):
        """
        Computes the unrounded cardinality estimate produced by the HLL
        algorithm for an HLL stored as ``HLLType.FULL``. Kept separate from
        ``cardinality()`` for testing purposes. The HLL's type must be
        ``HLLType.FULL``.

        The raw estimate is ``alpha * m^2 / sum(2^(-M[j]))``. It is replaced by
        ``HLLUtil.small_estimator`` when at least one register is zero and the
        raw estimate is below the small cutoff, and by
        ``HLLUtil.large_estimator`` when it exceeds the large cutoff.

        :returns: the unrounded cardinality given by the HLL algorithm
        :rtype: float
        """
        from python_hll.hllutil import HLLUtil

        # local copy of the register count ("m" in the paper)
        register_count = self._m

        # Accumulate the "indicator function" sum(2^(-M[j])) and count the
        # empty registers ("V" in the paper) in a single pass.
        inverse_power_sum = 0
        empty_registers = 0
        for value in self._probabilistic_storage.register_iterator():
            # assumes left_shift_long(1, v) yields 2^v -- TODO confirm
            inverse_power_sum += 1.0 / BitUtil.left_shift_long(1, value)
            if value == 0:
                empty_registers += 1

        raw_estimate = self._alpha_m_squared / inverse_power_sum

        # small-range correction only applies while some register is empty
        if empty_registers != 0 and raw_estimate < self._small_estimator_cutoff:
            return HLLUtil.small_estimator(register_count, empty_registers)

        # mid range: the raw estimate is used as-is
        if raw_estimate <= self._large_estimator_cutoff:
            return raw_estimate

        # large-range correction
        return HLLUtil.large_estimator(self._log2m, self._regwidth,
                                       raw_estimate)
Ejemplo n.º 2
0
    def _sparse_probabilistic_algorithm_cardinality(self):
        """
        Computes the unrounded cardinality estimate produced by the HLL
        algorithm for an HLL stored as ``HLLType.SPARSE``. Kept separate from
        ``cardinality()`` for testing purposes. ``type`` must be
        ``HLLType.SPARSE``.

        Registers absent from the sparse storage are treated as zero. The raw
        estimate is ``alpha * m^2 / sum(2^(-M[j]))``; the small- and
        large-range corrections from ``HLLUtil`` are applied when the raw
        estimate falls outside the corresponding cutoffs.

        :returns: the exact, unrounded cardinality given by the HLL algorithm
        :rtype: float
        """
        from python_hll.hllutil import HLLUtil

        register_count = self._m
        sparse_registers = self._sparse_probabilistic_storage

        # Accumulate the "indicator function" sum(2^(-M[j])) over all m
        # register slots and count the empty ones ("V" in the paper).
        inverse_power_sum = 0.0
        empty_registers = 0
        for index in range(register_count):
            # a missing entry means the register was never set
            value = sparse_registers.get(index, 0)
            if value == 0:
                empty_registers += 1
            # assumes left_shift_long(1, v) yields 2^v -- TODO confirm
            inverse_power_sum += 1.0 / BitUtil.left_shift_long(1, value)

        raw_estimate = self._alpha_m_squared / inverse_power_sum

        # small-range correction only applies while some register is empty
        if empty_registers != 0 and raw_estimate < self._small_estimator_cutoff:
            return HLLUtil.small_estimator(register_count, empty_registers)

        # mid range: the raw estimate is used as-is
        if raw_estimate <= self._large_estimator_cutoff:
            return raw_estimate

        # large-range correction
        return HLLUtil.large_estimator(self._log2m, self._regwidth,
                                       raw_estimate)
Ejemplo n.º 3
0
def test_large_range_smoke():
    """
    Smoke test for ``HLL.cardinality()`` and the proper use of the large
    range correction: every register of a ``FULL`` HLL is set to the same
    high value and the reported cardinality is compared with the corrected
    estimate computed by hand.
    """
    log2m = 12
    regwidth = 5
    register_count = BitUtil.left_shift_int(1, log2m)

    # With regwidth = 5 the hash space is log2m + (2^5 - 1 - 1) bits wide,
    # so L = log2m + 30.
    L = log2m + 30
    two_to_l = 2**L

    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    # Fill every register with a value high enough to trigger the large
    # range correction.
    register_value = 31
    for register_index in range(register_count):
        raw_value = probabilistic_test_util.construct_hll_value(
            log2m, register_index, register_value)
        hll.add_raw(raw_value)

    cardinality = hll.cardinality()

    # Simplified estimator when all registers take same value: alpha / (m/2^val)
    alpha_m_squared = HLLUtil.alpha_m_squared(register_count)
    estimator = alpha_m_squared / (register_count / (2**register_value))

    # The estimator must lie in the large range.
    assert estimator > two_to_l / 30

    # Large range correction: -2^L * log(1 - E/2^L)
    try:
        expected = ceil(-1.0 * two_to_l * log(1.0 - estimator / two_to_l))
    except ValueError:
        # log() rejects a non-positive argument (estimator >= 2^L); the
        # expected cardinality is taken to be 0 in that case.
        expected = 0
    assert cardinality == expected
Ejemplo n.º 4
0
def test_normal_range_smoke():
    """
    Smoke test for ``HLL.cardinality()`` and the proper use of the
    uncorrected estimator: every register of a ``FULL`` HLL is set to the
    same mid-sized value, so neither the small nor the large range
    correction should apply.
    """
    log2m = 11
    regwidth = 5
    register_count = BitUtil.left_shift_int(1, log2m)

    # With regwidth = 5 the hash space is log2m + (2^5 - 1 - 1) bits wide,
    # so L = log2m + 30.
    L = log2m + 30

    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    # All registers at a 'medium' value, chosen so that neither correction
    # kicks in.
    register_value = 7
    for register_index in range(register_count):
        raw_value = probabilistic_test_util.construct_hll_value(
            log2m, register_index, register_value)
        hll.add_raw(raw_value)

    cardinality = hll.cardinality()

    # Simplified estimator when all registers take same value: alpha / (m/2^val)
    alpha_m_squared = HLLUtil.alpha_m_squared(register_count)
    estimator = alpha_m_squared / (register_count / (2**register_value))

    # The estimator must sit between the two correction cutoffs.
    assert estimator <= (2**L) / 30
    assert estimator > (5 * register_count / 2)

    expected = ceil(estimator)
    assert cardinality == expected
Ejemplo n.º 5
0
def test_large_estimator_cutoff():
    """
    Tests that ``HLLUtil.large_estimator_cutoff()`` is the same
    as a trivial implementation.

    Every ``(log2m, regwidth)`` pair above the minimum parameters and up to
    (and including) the maximum parameters is checked.
    """
    for log2m in range(HLL.MINIMUM_LOG2M_PARAM + 1, HLL.MAXIMUM_LOG2M_PARAM + 1):
        # The upper bound must be the MAXIMUM regwidth; bounding the range by
        # the minimum on both ends yields an empty range and no assertions.
        for regwidth in range(HLL.MINIMUM_REGWIDTH_PARAM + 1, HLL.MAXIMUM_REGWIDTH_PARAM + 1):
            cutoff = HLLUtil.large_estimator_cutoff(log2m, regwidth)

            # See blog post (http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/)
            # and original paper (Fig. 3) for information on 2^L and
            # large range correction cutoff.
            #
            # The hash space is L = log2m + (2^regwidth - 2) bits wide and the
            # large range correction starts at 2^L / 30.
            hash_space_bits = (2 ** regwidth) - 2 + log2m
            expected = (2 ** hash_space_bits) / 30.0
            assert cutoff == expected
Ejemplo n.º 6
0
    def __init__(self,
                 log2m,
                 regwidth,
                 expthresh=-1,
                 sparseon=True,
                 type=HLLType.EMPTY):
        """
        Validates the parameters, precomputes the masks and estimator cutoffs
        derived from them, determines the explicit and sparse promotion
        thresholds, and initializes storage for the requested starting type.

        NOTE: Arguments here are named and structured identically to those in the
              PostgreSQL implementation, which can be found
              `here <https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning>`_.

        :param log2m: log-base-2 of the number of registers used in the HyperLogLog
               algorithm. Must be at least 4 and at most 30.
        :type log2m: int
        :param regwidth: number of bits used per register in the HyperLogLog
               algorithm. Must be at least 1 and at most 8.
        :type regwidth: int
        :param expthresh: tunes when the ``HLLType.EXPLICIT`` to
               ``HLLType.SPARSE`` promotion occurs,
               based on the set's cardinality. Must be at least -1 and at most 18.
               +-----------+--------------------------------------------------------------------------------+
               | expthresh | Meaning                                                                        |
               +===========+================================================================================+
               | -1        | Promote at whatever cutoff makes sense for optimal memory usage. ('auto' mode) |
               +-----------+--------------------------------------------------------------------------------+
               | 0         | Skip ``EXPLICIT`` representation in hierarchy.                                 |
               +-----------+--------------------------------------------------------------------------------+
               | 1-18      | Promote at 2:sup:`expthresh - 1` cardinality                                   |
               +-----------+--------------------------------------------------------------------------------+
        :type expthresh: int
        :param sparseon: Flag indicating if the ``HLLType.SPARSE``
               representation should be used.
        :type sparseon: boolean
        :param type: the type in the promotion hierarchy which this instance should
               start at. This cannot be ``None``.
        :type type: HLLType
        :raises Exception: if ``log2m``, ``regwidth`` or ``expthresh`` is
               outside its permitted range.
        """
        from python_hll.hllutil import HLLUtil

        self._log2m = log2m
        # The upper bound is MAXIMUM_LOG2M_PARAM (as the message states), not
        # the unrelated explicit-threshold constant.
        if log2m < HLL.MINIMUM_LOG2M_PARAM or log2m > HLL.MAXIMUM_LOG2M_PARAM:
            raise Exception("'log2m' must be at least " +
                            str(HLL.MINIMUM_LOG2M_PARAM) + " and at most " +
                            str(HLL.MAXIMUM_LOG2M_PARAM) + " (was: " +
                            str(log2m) + ")")

        self._regwidth = regwidth
        if regwidth < HLL.MINIMUM_REGWIDTH_PARAM or regwidth > HLL.MAXIMUM_REGWIDTH_PARAM:
            raise Exception("'regwidth' must be at least " +
                            str(HLL.MINIMUM_REGWIDTH_PARAM) + " and at most " +
                            str(HLL.MAXIMUM_REGWIDTH_PARAM) + " (was: " +
                            str(regwidth) + ")")

        # Values derived from log2m / regwidth, computed once up front.
        self._m = BitUtil.left_shift_int(1, log2m)
        self._m_bits_mask = self._m - 1
        self._value_mask = BitUtil.left_shift_int(1, regwidth) - 1
        self._pw_max_mask = HLLUtil.pw_max_mask(regwidth)
        self._alpha_m_squared = HLLUtil.alpha_m_squared(self._m)
        self._small_estimator_cutoff = HLLUtil.small_estimator_cutoff(self._m)
        self._large_estimator_cutoff = HLLUtil.large_estimator_cutoff(
            log2m, regwidth)

        if expthresh == -1:
            # 'auto' mode: derive the explicit threshold from the size of the
            # full representation.
            self._explicit_auto = True
            self._explicit_off = False

            # NOTE:  This math matches the size calculation in the PostgreSQL impl.
            # round up to next whole byte
            full_representation_size = (self._regwidth * self._m + 7) // 8
            # integer division to round down
            num_longs = full_representation_size // 8

            # cap the automatic threshold at the maximum allowed value
            if num_longs > HLL.MAXIMUM_EXPLICIT_THRESHOLD:
                self._explicit_threshold = HLL.MAXIMUM_EXPLICIT_THRESHOLD
            else:
                self._explicit_threshold = num_longs
        elif expthresh == 0:
            # EXPLICIT representation is skipped entirely
            self._explicit_auto = False
            self._explicit_off = True
            self._explicit_threshold = 0
        elif 0 < expthresh <= HLL.MAXIMUM_EXPTHRESH_PARAM:
            # caller-specified threshold: 2^(expthresh - 1)
            self._explicit_auto = False
            self._explicit_off = False
            self._explicit_threshold = BitUtil.left_shift_int(
                1, (expthresh - 1))
        else:
            raise Exception("'expthresh' must be at least " +
                            str(HLL.MINIMUM_EXPTHRESH_PARAM) +
                            " and at most " +
                            str(HLL.MAXIMUM_EXPTHRESH_PARAM) + " (was: " +
                            str(expthresh) + ")")

        # width in bits of one sparse entry: register value plus register index
        self._short_word_length = regwidth + log2m
        self._sparse_off = not sparseon
        if self._sparse_off:
            self._sparse_threshold = 0
        else:
            # TODO improve this cutoff to include the cost overhead of members/objects
            # Ratio of the full representation's bit size to one sparse
            # entry's bit size, rounded down to a power of two.
            largest_pow_2_less_than_cutoff = int(
                NumberUtil.log2(
                    (self._m * self._regwidth) / self._short_word_length))
            self._sparse_threshold = BitUtil.left_shift_int(
                1, largest_pow_2_less_than_cutoff)

        self._initialize_storage(type)