def test_small_range_smoke():
    """
    Smoke test for ``HLL.cardinality()``: verifies that the small range
    correction ``m * log(m / V)`` (``V`` = number of zero registers) is
    applied at both extremes of register occupancy.
    """
    log2m = 11
    regwidth = 5
    register_count = BitUtil.left_shift_int(1, log2m)

    # Scenario 1: exactly one register is set, so V = m - 1.
    # The small range conditions hold trivially: zero registers exist and
    # the raw estimator is far below 5m/2.
    one_set = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    single_raw = probabilistic_test_util.construct_hll_value(log2m, 0, 1)
    one_set.add_raw(single_raw)
    observed = one_set.cardinality()

    zero_registers = register_count - 1
    corrected = ceil(register_count * log(register_count / zero_registers))
    assert observed == corrected

    # Scenario 2: every register but the last is set, so V = 1.
    # A zero register still exists and the estimator is still trivially
    # smaller than 5m/2, so the small range correction applies again.
    almost_full = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    for register_index in range(register_count - 1):
        raw = probabilistic_test_util.construct_hll_value(log2m, register_index, 1)
        almost_full.add_raw(raw)
    observed = almost_full.cardinality()

    corrected = ceil(register_count * log(register_count / 1))
    assert observed == corrected
def test_large_range_smoke():
    """
    Smoke test for ``HLL.cardinality()``: verifies that the large range
    correction ``-2^L * log(1 - E / 2^L)`` is applied when every register
    holds a large value.
    """
    log2m = 12
    regwidth = 5
    register_value = 31  # chosen to ensure large correction kicks in

    # With regwidth = 5 the hash space is log2m + (2^5 - 1 - 1) bits wide,
    # i.e. L = log2m + 30.
    L = log2m + 30
    hash_space = 2**L
    m = BitUtil.left_shift_int(1, log2m)

    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    for register_index in range(m):
        raw = probabilistic_test_util.construct_hll_value(
            log2m, register_index, register_value)
        hll.add_raw(raw)
    observed = hll.cardinality()

    # When all registers share one value the estimator collapses to
    # alpha / (m / 2^val).
    estimator = HLLUtil.alpha_m_squared(m) / (m / (2**register_value))

    # Precondition for the large range correction.
    assert estimator > hash_space / 30

    try:
        corrected = ceil(-1.0 * hash_space * log(1.0 - estimator / hash_space))
    except ValueError:
        # log() rejected its argument (non-positive); the expected
        # cardinality is then taken to be 0.
        corrected = 0

    assert observed == corrected
def test_normal_range_smoke():
    """
    Smoke test for ``HLL.cardinality()``: verifies that the raw estimator
    is reported unchanged when neither range correction applies.
    """
    log2m = 11
    regwidth = 5
    register_value = 7  # 'medium' value: chosen so neither correction kicks in

    # With regwidth = 5 the hash space is log2m + (2^5 - 1 - 1) bits wide,
    # i.e. L = log2m + 30.
    L = log2m + 30
    hash_space = 2**L
    m = BitUtil.left_shift_int(1, log2m)

    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    for register_index in range(m):
        raw = probabilistic_test_util.construct_hll_value(
            log2m, register_index, register_value)
        hll.add_raw(raw)
    observed = hll.cardinality()

    # When all registers share one value the estimator collapses to
    # alpha / (m / 2^val).
    estimator = HLLUtil.alpha_m_squared(m) / (m / (2**register_value))

    # The estimator must sit strictly between the two correction regimes.
    assert estimator <= hash_space / 30
    assert estimator > (5 * m / 2)

    assert observed == ceil(estimator)
def new_hll(type):
    """
    Convenience wrapper around the testing constructor that fills in the
    module-level defaults.

    :param type: the ``HLLType`` of the instance to build. (The name shadows
                 the builtin; it is kept so existing callers keep working.)
    :returns: a new ``HLL`` of the requested type, configured with the
              ``LOG2M``, ``REGWIDTH``, ``EXPLICIT_THRESHOLD`` and
              ``SPARSE_THRESHOLD`` constants.
    """
    constructor_args = (
        LOG2M,
        REGWIDTH,
        EXPLICIT_THRESHOLD,
        SPARSE_THRESHOLD,
        type,
    )
    return HLL.create_for_testing(*constructor_args)
def run_one_test(tokens):
    """
    Estimate the cardinality of ``tokens`` for every ``(log2m, reg_width)``
    combination in ``[LOG2M_MIN, LOG2M_MAX] x [REG_WIDTH_MIN, REG_WIDTH_MAX]``
    and hand the collected results to ``plot``.

    :param tokens: iterable of values accepted by ``mmh3.hash``. It is
                   consumed exactly once, so one-shot iterables (generators,
                   open files) are safe to pass.
    """
    # Hashing does not depend on the HLL parameters, so do it once up front
    # instead of once per trial. This also keeps every trial fed with the
    # full data set when ``tokens`` can only be iterated a single time.
    hashed_values = [mmh3.hash(token) for token in tokens]

    log2m_list = []
    reg_width_list = []
    cardinality_list = []

    num_trials = (LOG2M_MAX - LOG2M_MIN + 1) * (REG_WIDTH_MAX - REG_WIDTH_MIN + 1)
    trial = 1
    for log2m in range(LOG2M_MIN, LOG2M_MAX + 1):
        for reg_width in range(REG_WIDTH_MIN, REG_WIDTH_MAX + 1):
            # Progress output, one pair of lines per trial.
            print("Trial" + " " + str(trial) + " / " + str(num_trials))
            print(reg_width)

            hll = HLL(log2m, reg_width)
            for hashed_value in hashed_values:
                hll.add_raw(hashed_value)

            log2m_list.append(log2m)
            reg_width_list.append(reg_width)
            cardinality_list.append(hll.cardinality())
            trial += 1

    plot(log2m_list, reg_width_list, cardinality_list)
def test_to_from_bytes():
    """
    Tests ``HLL.to_bytes()`` and ``HLL.from_bytes()`` for explicit-type
    instances that are empty, partially filled and filled to the threshold.
    """
    schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION
    hll_type = HLLType.EXPLICIT
    padding = schema_version.padding_bytes(hll_type)
    bytes_per_word = 8

    def assert_round_trip(original, expected_length):
        # Serialize, check the payload size, then make sure deserializing
        # gives back the same elements.
        serialized = original.to_bytes(schema_version)
        assert len(serialized) == expected_length
        restored = HLL.from_bytes(serialized)
        assert_elements_equal(original, restored)

    # Empty set: no elements, just padding.
    hll = new_hll(128)
    assert_round_trip(hll, padding)

    # Partially filled set: one word per stored element.
    hll = new_hll(128)
    for raw_value in range(3):
        hll.add_raw(raw_value)
    assert_round_trip(hll, padding + bytes_per_word * 3)

    # Full set: exactly ``explicit_threshold`` elements.
    explicit_threshold = 128
    hll = new_hll(explicit_threshold)
    for raw_value in range(27, 27 + explicit_threshold):
        hll.add_raw(raw_value)
    assert_round_trip(hll, padding + bytes_per_word * explicit_threshold)
def new_hll(explicit_threshold):
    """
    Creates an empty ``HLLType.EXPLICIT`` ``HLL`` with fixed sizing
    (log2m 11, register width 5) and the given explicit threshold.

    :param explicit_threshold: explicit threshold to use for the constructed
                               ``HLL``. This must be greater than zero.
    :type explicit_threshold: int
    :returns: A default-sized ``HLLType.EXPLICIT`` empty ``HLL`` instance.
              This will never be ``None``.
    :rtype: HLL
    """
    log2m = 11
    regwidth = 5
    sparse_threshold = 256
    return HLL.create_for_testing(
        log2m, regwidth, explicit_threshold, sparse_threshold, HLLType.EXPLICIT)
def test_to_from_bytes():
    """
    Tests ``HLL.to_bytes()`` and ``HLL.from_bytes()`` for full-type
    instances that are empty, partially filled and completely filled.
    The serialized size is the same in all three cases.
    """
    log2m = 11  # arbitrary
    regwidth = 5
    register_count = BitUtil.left_shift_int(1, log2m)  # aka 2^log2m = m

    schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION
    hll_type = HLLType.FULL
    padding = schema_version.padding_bytes(hll_type)
    data_byte_count = probabilistic_test_util.get_required_bytes(
        regwidth, register_count)
    expected_byte_count = padding + data_byte_count

    def fresh_hll():
        return HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    def assert_round_trip(original):
        serialized = original.to_bytes(schema_version)
        # output length must be correct
        assert len(serialized) == expected_byte_count
        restored = HLL.from_bytes(serialized)
        # register values must survive the round trip
        assert_elements_equal(original, restored)

    # Empty element.
    assert_round_trip(fresh_hll())

    # Partially filled element: three registers set.
    partial = fresh_hll()
    for register_index in range(3):
        partial.add_raw(probabilistic_test_util.construct_hll_value(
            log2m, register_index, register_index + 9))
    assert_round_trip(partial)

    # Full set: every register set, values cycling through 1..9.
    full = fresh_hll()
    for register_index in range(register_count):
        full.add_raw(probabilistic_test_util.construct_hll_value(
            log2m, register_index, (register_index % 9) + 1))
    assert_round_trip(full)
def test_clear():
    """
    Tests ``HLL.clear()``: after clearing, every register of the backing
    storage reads as zero.
    """
    log2m = 4  # 16 registers per counter
    regwidth = 5
    register_count = BitUtil.left_shift_int(1, log2m)

    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    # Grab the storage up front; the same object is inspected after clear().
    registers = hll._probabilistic_storage

    # Give each register a distinct value (its own index).
    for index in range(register_count):
        registers.set_register(index, index)

    hll.clear()

    for index in range(register_count):
        # 0 is the default value of a register
        assert registers.get_register(index) == 0
def test_promotion():
    """
    Tests promotion out of ``HLLType.EXPLICIT``: to ``HLLType.SPARSE`` via
    the testing constructor, and to ``HLLType.FULL`` via the regular one.
    """
    # Explicit -> sparse: one element more than the explicit threshold.
    explicit_threshold = 128
    to_sparse = HLL.create_for_testing(
        11, 5, explicit_threshold, 256, HLLType.EXPLICIT)
    for raw_value in range(explicit_threshold + 1):
        to_sparse.add_raw(raw_value)
    assert to_sparse.get_type() == HLLType.SPARSE

    # Explicit -> full: expthresh=4 => explicit_threshold=8, so nine
    # elements overflow it. The fourth argument is False here, presumably
    # disabling the sparse representation -- confirm against HLL.__init__.
    to_full = HLL(11, 5, 4, False, HLLType.EXPLICIT)
    for raw_value in range(9):
        to_full.add_raw(raw_value)
    assert to_full.get_type() == HLLType.FULL
def test_register_value():
    """
    Tests the bounds on a register's value for a given raw input value,
    for register widths 4 and 5.
    """
    log2m = 4  # small enough to make testing easy

    # Each case is (raw value, register index 'j', expected register value).
    # In every case the lowest hex digit of the raw value equals 'j'.
    regwidth_4_cases = [
        # lower-bounds of the register
        (0x000000000000001, 1, 0),
        (0x0000000000000012, 2, 1),
        (0x0000000000000023, 3, 2),
        (0x0000000000000044, 4, 3),
        (0x0000000000000085, 5, 4),
        # upper-bounds of the register
        # NOTE: bear in mind that BitVector itself does ensure that
        #       overflow of a register is prevented
        (0x0000000000010006, 6, 13),
        (0x0000000000020007, 7, 14),
        (0x0000000000040008, 8, 15),
        (0x0000000000080009, 9, 15),  # overflow
        # sanity checks to ensure that no other bits above the lowest-set
        # bit matter (same expectation as case 'j'=6 above)
        (0x000000000003000A, 10, 13),
        (0x000000000011000B, 11, 13),
    ]

    regwidth_5_cases = [
        # lower-bounds of the register
        (0x0000000000000001, 1, 0),
        (0x0000000000000012, 2, 1),
        (0x0000000000000023, 3, 2),
        (0x0000000000000044, 4, 3),
        (0x0000000000000085, 5, 4),
        # upper-bounds of the register
        # NOTE: bear in mind that BitVector itself does ensure that
        #       overflow of a register is prevented
        (0x0000000100000006, 6, 29),
        (0x0000000200000007, 7, 30),
        (0x0000000400000008, 8, 31),
        (0x0000000800000009, 9, 31),  # overflow
    ]

    # Register width 4 is the minimum size; 5 is exercised afterwards on a
    # fresh instance.
    for regwidth, cases in ((4, regwidth_4_cases), (5, regwidth_5_cases)):
        hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
        bit_vector = hll._probabilistic_storage
        for raw_value, register_index, expected_value in cases:
            hll.add_raw(raw_value)
            assert bit_vector.get_register(register_index) == expected_value
def assert_cardinality(hll_type, items, fastonly):
    """
    Builds an ``HLL`` of ``hll_type`` for a grid of constructor parameters,
    adds ``items`` to it, and checks that both a serialization round trip
    and a ``deepcopy`` yield an instance with the same cardinality, type
    and serialized bytes.

    :param hll_type: type passed to the ``HLL`` constructor.
    :param items: raw values added to every constructed ``HLL``.
    :param fastonly: when truthy, only the minimum and maximum of each
                     parameter range are exercised instead of every value.
    """
    # NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
    #       completes in a reasonable amount of time. Not much is gained by
    #       testing larger values - there are no more known serialization
    #       related edge cases that appear as log2m gets even larger.
    max_log2m = 16
    if fastonly:
        log2m_values = (HLL.MINIMUM_LOG2M_PARAM, max_log2m)
        regwidth_values = (HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM)
        expthresh_values = (HLL.MINIMUM_EXPTHRESH_PARAM, HLL.MAXIMUM_EXPTHRESH_PARAM)
    else:
        log2m_values = range(HLL.MINIMUM_LOG2M_PARAM, max_log2m + 1)
        regwidth_values = range(HLL.MINIMUM_REGWIDTH_PARAM,
                                HLL.MAXIMUM_REGWIDTH_PARAM + 1)
        expthresh_values = range(HLL.MINIMUM_EXPTHRESH_PARAM,
                                 HLL.MAXIMUM_EXPTHRESH_PARAM + 1)

    def assert_equivalent(candidate, reference):
        assert candidate.cardinality() == reference.cardinality()
        assert candidate.get_type() == reference.get_type()
        assert candidate.to_bytes() == reference.to_bytes()

    for log2m in log2m_values:
        for regwidth in regwidth_values:
            for expthresh in expthresh_values:
                for sparse_on in (True, False):
                    hll = HLL(log2m, regwidth, expthresh, sparse_on, hll_type)
                    for raw_value in items:
                        hll.add_raw(raw_value)

                    # Serialization round trip.
                    assert_equivalent(HLL.from_bytes(hll.to_bytes()), hll)
                    # In-memory deep copy.
                    assert_equivalent(deepcopy(hll), hll)

        # Progress marker, one per log2m value.
        sys.stdout.write('.')
        sys.stdout.flush()
# Command-line script: estimates the number of distinct lines in a text file
# with a HyperLogLog of the requested size and prints the estimate.
import argparse
import sys

import mmh3
from python_hll.hll import HLL

parser = argparse.ArgumentParser(
    description='Estimate unique words in text file(s)')
parser.add_argument('file', metavar='F', type=str, help="csv")
parser.add_argument('log2m', metavar='L', type=int)
parser.add_argument('reg_width', metavar='R', type=int)
a = parser.parse_args()

hll = HLL(a.log2m, a.reg_width)
with open(a.file, 'r') as f:
    for token in f:
        # Drop the line terminator before hashing. Otherwise a final line
        # without a trailing newline ("foo") hashes differently from an
        # identical earlier line ("foo\n") and is counted as a new token.
        hashed_value = mmh3.hash(token.rstrip('\n'))
        hll.add_raw(hashed_value)

print(hll.cardinality())
def string_to_hll(s):
    """
    Builds an ``HLL`` from its hex-string form.

    The first two characters of ``s`` (expected to be the ``\\x`` prefix)
    are discarded unconditionally; the remainder is decoded from hex and
    deserialized.
    """
    hex_digits = s[2:]
    raw_bytes = NumberUtil.from_hex(hex_digits, 0, len(hex_digits))
    return HLL.from_bytes(raw_bytes)