def get_register_value(raw_value, log2m):
    """
    Derives the HLL register value from a raw value.

    The low ``log2m`` bits of ``raw_value`` are discarded with
    ``BitUtil.unsigned_right_shift_long``; the register value is then the
    one-indexed position of the least significant set bit of what remains,
    capped at 31 and passed through ``BitUtil.to_signed_byte``.

    :param raw_value: the raw value to extract the register value from.
    :param log2m: the number of low-order bits to shift out before the
                  least significant set bit is located.
    :returns: ``0`` when the shifted value is zero, otherwise the capped,
              one-indexed position of its least significant set bit.
    """
    substream_value = BitUtil.unsigned_right_shift_long(raw_value, log2m)

    # The paper does not cover p(0x0), so the special value 0 is used.
    # 0 is the original initialization value of the registers, so by
    # returning it the HLL simply ignores this value. This is acceptable
    # because the probability is 1/(2^(2^register_size_in_bits)).
    if substream_value == 0:
        return 0

    # One-indexed position of the lowest set bit, clamped to at most 31.
    lowest_set_position = 1 + BitUtil.least_significant_bit(substream_value)
    return BitUtil.to_signed_byte(min(lowest_set_position, 31))
def _add_raw_sparse_probabilistic(self, raw_value):
    """
    Folds a raw value into the ``sparseProbabilisticStorage``
    (``type`` ``HLLType.SPARSE``).

    The register index is taken from ``raw_value & self._m_bits_mask`` and
    the candidate register value from the bits left after shifting
    ``raw_value`` right by ``self._log2m``. The stored register is only
    overwritten when the candidate is strictly larger than the value
    currently held (missing registers count as ``0``).

    :param long raw_value: the raw value to add to the sparse storage.
    :rtype: void
    """
    substream = BitUtil.unsigned_right_shift_long(raw_value, self._log2m)

    # The paper does not cover p(0x0), so the special value 0 is used for it.
    # A zero register is algorithmically an "unset" register, and "unset"
    # registers are not stored in order to save memory (the very reason this
    # sparse implementation exists). Storing a zero would also break the
    # algorithm_cardinality code, so bail out right away. This is acceptable
    # because the probability is 1/(2^(2^register_size_in_bits)).
    if substream == 0:
        return

    # p(w): position of the least significant set bit (one-indexed).
    # By contract: p(w) <= 2^(register_value_in_bits) - 1 (the max register
    # value).
    #
    # By construction of pw_max_mask (see constructor),
    #     lsb(pw_max_mask) = 2^(register_value_in_bits) - 2,
    # thus lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) - 2,
    # thus 1 + lsb(any_long | pw_max_mask) <= 2^(register_value_in_bits) - 1.
    capped_substream = substream | self._pw_max_mask
    p_w = BitUtil.to_signed_byte(
        1 + BitUtil.least_significant_bit(capped_substream))

    # Same short-circuit as above, applied to the computed register value.
    if p_w == 0:
        return

    # NOTE: no +1 as in paper since 0-based indexing
    register_index = int(raw_value & self._m_bits_mask)

    storage = self._sparse_probabilistic_storage
    stored_value = storage.get(register_index, 0)
    if p_w > stored_value:
        storage[register_index] = p_w
def from_bytes(cls, bytes):
    """
    Rebuilds an HLL from its serialized form (``toBytes()`` format).

    The schema version and metadata are read from ``bytes`` first; an
    ``HLL`` is then constructed from the metadata and, unless it is of type
    ``HLLType.EMPTY``, its storage is populated word by word from a
    deserializer obtained from the schema version.

    :param list bytes: the serialized bytes of new HLL
    :returns: the deserialized HLL. This will never be ``None``.
    :rtype: HLL
    :raises Exception: if the metadata reports an HLL type other than
                       EMPTY, EXPLICIT, SPARSE or FULL.
    """
    from python_hll.hllutil import HLLUtil

    schema_version = SerializationUtil.get_schema_version(bytes)
    metadata = schema_version.read_metadata(bytes)

    hll_type = metadata.hll_type()
    regwidth = metadata.register_width()
    log2m = metadata.register_count_log2()
    sparse_enabled = metadata.sparse_enabled()

    if metadata.explicit_auto():
        expthresh = -1
    elif metadata.explicit_off():
        expthresh = 0
    else:
        # NOTE: take into account that the postgres-compatible constructor
        #       subtracts one before taking a power of two.
        expthresh = metadata.log2_explicit_cutoff() + 1

    hll = HLL(log2m, regwidth, expthresh, sparse_enabled, hll_type)

    # An empty HLL carries no payload, so there is nothing left to read.
    if hll_type == HLLType.EMPTY:
        return hll

    # The width of each serialized word depends on the storage type.
    if hll_type == HLLType.EXPLICIT:
        word_length = HLLUtil.LONG_BIT_LENGTH  # 64 for both java and python
    elif hll_type == HLLType.SPARSE:
        word_length = hll._short_word_length
    elif hll_type == HLLType.FULL:
        word_length = hll._regwidth
    else:
        raise Exception('Unsupported HLL type: {}'.format(hll_type))

    deserializer = schema_version.get_deserializer(
        hll_type, word_length, bytes)

    if hll_type == HLLType.EXPLICIT:
        # NOTE: This should not exceed expthresh and this will always
        #       be exactly the number of words that were encoded,
        #       because the word length is at least a byte wide.
        # SEE: BigEndianAscendingWordDeserializer.total_word_count()
        for _ in range(deserializer.total_word_count()):
            hll._explicit_storage.add(deserializer.read_word())
    elif hll_type == HLLType.SPARSE:
        # NOTE: If the short_word_length were smaller than 8 bits
        #       (1 byte) there would be a possibility (because of
        #       padding arithmetic) of having one or more extra
        #       registers read. However, this is not relevant as the
        #       extra registers will be all zeroes, which are ignored
        #       in the sparse representation.
        for _ in range(deserializer.total_word_count()):
            short_word = deserializer.read_word()
            register_value = BitUtil.to_signed_byte(
                short_word & hll._value_mask)

            # Zero registers are never stored in the sparse representation.
            if register_value == 0:
                continue

            shifted_word = BitUtil.unsigned_right_shift_long(
                short_word, hll._regwidth)
            register_key = int(shifted_word)
            hll._sparse_probabilistic_storage[register_key] = register_value
    elif hll_type == HLLType.FULL:
        # NOTE: Iteration is done using m (register count) and NOT
        #       deserializer.total_word_count() because regwidth may be
        #       less than 8 and as such the padding on the 'last' byte
        #       may be larger than regwidth, causing an extra register
        #       to be read.
        # SEE: BigEndianAscendingWordDeserializer.total_word_count()
        for register_index in range(hll._m):
            hll._probabilistic_storage.set_register(
                register_index, deserializer.read_word())
    else:
        raise Exception('Unsupported HLL type: {}'.format(hll_type))

    return hll
def test_to_signed_byte():
    """
    Checks ``BitUtil.to_signed_byte`` against every unsigned/signed pair
    listed in ``UNSIGNED_TO_SIGNED_INTEGERS``.
    """
    for unsigned_int, expected_signed in UNSIGNED_TO_SIGNED_INTEGERS.items():
        converted = BitUtil.to_signed_byte(unsigned_int)
        assert expected_signed == converted