Beispiel #1
0
    def register_bit_size(cls, expected_unique_elements):
        """
        Computes the bit-width of HLL registers necessary to estimate a set of
        the specified cardinality.

        The width is ``ceil(log2(log2(n)))``, clamped from below by
        ``HLL.MINIMUM_REGWIDTH_PARAM``. The result is rounded *up* so that the
        returned number of whole bits is always sufficient.

        :param long expected_unique_elements: an upper bound on the number of unique
               elements that are expected.  This must be greater than zero.
        :returns: a register size in bits (i.e. ``log2(log2(n))`` rounded up)
        :rtype: int
        """
        from math import ceil

        log2_n = NumberUtil.log2(expected_unique_elements)
        if log2_n <= 0:
            # log2(log2(n)) is undefined here (e.g. n == 1), so the smallest
            # allowed register width is the only sensible answer.
            return HLL.MINIMUM_REGWIDTH_PARAM

        # A fractional bit count is meaningless: round up to whole bits.
        return max(HLL.MINIMUM_REGWIDTH_PARAM,
                   int(ceil(NumberUtil.log2(log2_n))))
Beispiel #2
0
    def __init__(self,
                 log2m,
                 regwidth,
                 expthresh=-1,
                 sparseon=True,
                 type=HLLType.EMPTY):
        """
        Validates the tuning parameters, derives the masks, cutoffs and
        promotion thresholds from them, and initializes storage for ``type``.

        NOTE: Arguments here are named and structured identically to those in the
              PostgreSQL implementation, which can be found
              `here <https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning>`_.

        :param log2m: log-base-2 of the number of registers used in the HyperLogLog
               algorithm. Must be at least 4 and at most 30.
        :type log2m: int
        :param regwidth: number of bits used per register in the HyperLogLog
               algorithm. Must be at least 1 and at most 8.
        :type regwidth: int
        :param expthresh: tunes when the ``HLLType.EXPLICIT`` to
               ``HLLType.SPARSE`` promotion occurs,
               based on the set's cardinality. Must be at least -1 and at most 18.
               +-----------+--------------------------------------------------------------------------------+
               | expthresh | Meaning                                                                        |
               +===========+================================================================================+
               | -1        | Promote at whatever cutoff makes sense for optimal memory usage. ('auto' mode) |
               +-----------+--------------------------------------------------------------------------------+
               | 0         | Skip ``EXPLICIT`` representation in hierarchy.                                 |
               +-----------+--------------------------------------------------------------------------------+
               | 1-18      | Promote at 2:sup:`expthresh - 1` cardinality                                   |
               +-----------+--------------------------------------------------------------------------------+
        :type expthresh: int
        :param sparseon: Flag indicating if the ``HLLType.SPARSE``
               representation should be used.
        :type sparseon: boolean
        :param type: the type in the promotion hierarchy which this instance should
               start at. This cannot be ``None``.
        :type type: HLLType
        :raises Exception: if ``log2m``, ``regwidth`` or ``expthresh`` lies outside
               the range bounded by the corresponding ``HLL.MINIMUM_*_PARAM`` /
               ``HLL.MAXIMUM_*_PARAM`` constants.
        """
        # Imported here rather than at module level (presumably to avoid a
        # circular import between the hll and hllutil modules).
        from python_hll.hllutil import HLLUtil

        # The upper bound must be MAXIMUM_LOG2M_PARAM, the same constant that
        # the error message reports.
        if log2m < HLL.MINIMUM_LOG2M_PARAM or log2m > HLL.MAXIMUM_LOG2M_PARAM:
            raise Exception("'log2m' must be at least " +
                            str(HLL.MINIMUM_LOG2M_PARAM) + " and at most " +
                            str(HLL.MAXIMUM_LOG2M_PARAM) + " (was: " +
                            str(log2m) + ")")
        self._log2m = log2m

        if regwidth < HLL.MINIMUM_REGWIDTH_PARAM or regwidth > HLL.MAXIMUM_REGWIDTH_PARAM:
            raise Exception("'regwidth' must be at least " +
                            str(HLL.MINIMUM_REGWIDTH_PARAM) + " and at most " +
                            str(HLL.MAXIMUM_REGWIDTH_PARAM) + " (was: " +
                            str(regwidth) + ")")
        self._regwidth = regwidth

        # Values derived from log2m / regwidth.
        self._m = BitUtil.left_shift_int(1, log2m)  # number of registers
        self._m_bits_mask = self._m - 1
        self._value_mask = BitUtil.left_shift_int(1, regwidth) - 1
        self._pw_max_mask = HLLUtil.pw_max_mask(regwidth)
        self._alpha_m_squared = HLLUtil.alpha_m_squared(self._m)
        self._small_estimator_cutoff = HLLUtil.small_estimator_cutoff(self._m)
        self._large_estimator_cutoff = HLLUtil.large_estimator_cutoff(
            log2m, regwidth)

        if expthresh == -1:
            self._explicit_auto = True
            self._explicit_off = False

            # NOTE:  This math matches the size calculation in the PostgreSQL impl.
            # Integer division keeps both results as ints.
            full_representation_size = (self._regwidth * self._m + 7) // 8  # round up to next whole byte
            num_longs = full_representation_size // 8  # integer division to round down

            # Cap the automatic threshold at the largest supported value.
            self._explicit_threshold = min(num_longs,
                                           HLL.MAXIMUM_EXPLICIT_THRESHOLD)
        elif expthresh == 0:
            self._explicit_auto = False
            self._explicit_off = True
            self._explicit_threshold = 0
        elif 0 < expthresh <= HLL.MAXIMUM_EXPTHRESH_PARAM:
            self._explicit_auto = False
            self._explicit_off = False
            # Threshold is 2 ** (expthresh - 1).
            self._explicit_threshold = BitUtil.left_shift_int(
                1, (expthresh - 1))
        else:
            raise Exception("'expthresh' must be at least " +
                            str(HLL.MINIMUM_EXPTHRESH_PARAM) +
                            " and at most " +
                            str(HLL.MAXIMUM_EXPTHRESH_PARAM) + " (was: " +
                            str(expthresh) + ")")

        # Width of the combined (register index, register value) word used by
        # the sparse representation.
        self._short_word_length = regwidth + log2m
        self._sparse_off = not sparseon
        if self._sparse_off:
            self._sparse_threshold = 0
        else:
            # TODO improve this cutoff to include the cost overhead of members/objects
            largest_pow_2_less_than_cutoff = int(
                NumberUtil.log2(
                    (self._m * self._regwidth) / self._short_word_length))
            self._sparse_threshold = BitUtil.left_shift_int(
                1, largest_pow_2_less_than_cutoff)

        self._initialize_storage(type)
Beispiel #3
0
    def to_bytes(self,
                 schema_version=SerializationUtil.DEFAULT_SCHEMA_VERSION):
        """
        Serializes this HLL according to ``schema_version``, which defaults to
        ``SerializationUtil.DEFAULT_SCHEMA_VERSION``.

        The payload depends on the current representation: ``EMPTY`` produces
        only zeroed padding bytes, ``EXPLICIT`` writes the stored values in
        ascending order, ``SPARSE`` writes one packed (index, value) word per
        register in ascending index order, and ``FULL`` writes the register
        contents. The schema version then writes the metadata into the result.

        :param SchemaVersion schema_version: the schema version dictating the serialization format
        :returns: the array of bytes representing the HLL. This will never be
                  ``None`` or empty.
        :rtype: list
        :raises Exception: if the HLL's type is not one of the four handled types.
        """
        from python_hll.hllutil import HLLUtil

        hll_type = self._type

        if hll_type == HLLType.EMPTY:
            # Nothing to write beyond the space reserved by the schema.
            serialized = [0] * schema_version.padding_bytes(hll_type)

        elif hll_type == HLLType.EXPLICIT:
            writer = schema_version.get_serializer(
                hll_type, HLLUtil.LONG_BIT_LENGTH,
                len(self._explicit_storage))

            for stored_value in sorted(self._explicit_storage):
                writer.write_word(stored_value)

            serialized = writer.get_bytes()

        elif hll_type == HLLType.SPARSE:
            sparse_registers = self._sparse_probabilistic_storage
            writer = schema_version.get_serializer(
                hll_type, self._short_word_length, len(sparse_registers))

            for index in sorted(sparse_registers.keys()):
                value = sparse_registers.get(index, 0)
                # The "short word" holds the index in the high bits and the
                # register value in the low ``regwidth`` bits.
                packed = BitUtil.left_shift_int(index, self._regwidth) | value
                writer.write_word(packed)

            serialized = writer.get_bytes()

        elif hll_type == HLLType.FULL:
            writer = schema_version.get_serializer(hll_type, self._regwidth,
                                                   self._m)
            self._probabilistic_storage.get_register_contents(writer)
            serialized = writer.get_bytes()

        else:
            raise Exception('Unsupported HLL type: {}'.format(self._type))

        # The explicit threshold is only recorded when it was set manually;
        # in 'auto' or 'off' mode the field is left at zero.
        if self._explicit_auto or self._explicit_off:
            threshold_exponent = 0
        else:
            threshold_exponent = int(NumberUtil.log2(self._explicit_threshold))

        sparse_enabled = not self._sparse_off
        header = HLLMetadata(schema_version.schema_version_number(),
                             self._type, self._log2m, self._regwidth,
                             threshold_exponent, self._explicit_off,
                             self._explicit_auto, sparse_enabled)
        schema_version.write_metadata(serialized, header)

        return serialized
def hll_to_string(hll):
    """
    Serializes an HLL and renders the result as a hex string prefixed with \\x.

    :param hll: the HLL to convert; its ``to_bytes()`` output is hex-encoded.
    :returns: the hex digits of the serialized HLL, preceded by \\x.
    """
    serialized = hll.to_bytes()
    hex_digits = NumberUtil.to_hex(serialized, 0, len(serialized))
    return '\\x' + hex_digits
def string_to_hll(s):
    """
    Builds an HLL from a hex string that carries a leading \\x prefix.

    The first two characters are dropped unconditionally, so the caller is
    expected to pass a string that really starts with the prefix.

    :param s: the prefixed hex string.
    :returns: the result of ``HLL.from_bytes`` on the decoded hex digits.
    """
    hex_digits = s[2:]
    decoded = NumberUtil.from_hex(hex_digits, 0, len(hex_digits))
    return HLL.from_bytes(decoded)