Beispiel #1
0
def test_small_range_smoke():
    """
    Smoke test for ``HLL.cardinality()`` in the regime where the small
    range correction is expected to be applied.

    Two ``HLLType.FULL`` instances are checked: one with exactly one
    register set, and one with every register except the last one set.
    In both cases the expected value is ``ceil(m * log(m / V))``, where
    ``V`` is the number of registers that are still zero.
    """
    regwidth = 5
    log2m = 11
    m = BitUtil.left_shift_int(1, log2m)

    def make_full_hll():
        # Every scenario starts from an identically configured, empty HLL.
        return HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    # --- Scenario 1: a single register is set ---------------------------
    # Zero registers obviously remain, and the raw estimator is trivially
    # below 5m/2, so the small range correction conditions hold.
    single = make_full_hll()
    single.add_raw(probabilistic_test_util.construct_hll_value(log2m, 0, 1))
    observed = single.cardinality()

    zero_registers = m - 1
    expected = ceil(m * log(m / zero_registers))
    assert observed == expected

    # --- Scenario 2: all registers but one are set ----------------------
    # One zero register remains, and the raw estimator is still trivially
    # below 5m/2 (it is alpha / ((m-1)/2)), so the correction applies again.
    almost_full = make_full_hll()
    for register_index in range(m - 1):
        raw_value = probabilistic_test_util.construct_hll_value(
            log2m, register_index, 1)
        almost_full.add_raw(raw_value)
    observed = almost_full.cardinality()

    zero_registers = 1
    expected = ceil(m * log(m / zero_registers))
    assert observed == expected
Beispiel #2
0
def test_large_range_smoke():
    """
    Smoke test for ``HLL.cardinality()`` in the regime where the large
    range correction is expected to be applied.

    Every register of an ``HLLType.FULL`` instance is driven to the same
    high value, the test asserts that the raw estimator exceeds
    ``2^L / 30``, and the reported cardinality is compared with
    ``ceil(-2^L * log(1 - E / 2^L))``.
    """
    regwidth = 5
    log2m = 12

    # With regwidth = 5 the hash space is log2m + (2^5 - 1 - 1) bits wide,
    # i.e. L = log2m + 30.
    L = log2m + 30
    two_to_the_l = 2**L

    m = BitUtil.left_shift_int(1, log2m)
    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    # Value picked so that the large range correction is triggered.
    register_value = 31
    for register_index in range(m):
        raw_value = probabilistic_test_util.construct_hll_value(
            log2m, register_index, register_value)
        hll.add_raw(raw_value)

    observed = hll.cardinality()

    # When all registers hold the same value the estimator simplifies to
    # alpha / (m / 2^val).
    estimator = HLLUtil.alpha_m_squared(m) / (m / (2**register_value))

    # Precondition for the large range correction.
    assert estimator > two_to_the_l / 30

    # Large range correction: -2^L * log(1 - E / 2^L). If ``log`` rejects
    # its argument with ValueError, the expected cardinality is taken to be 0.
    try:
        expected = ceil(-1.0 * two_to_the_l * log(1.0 - estimator / two_to_the_l))
    except ValueError:
        expected = 0

    assert observed == expected
Beispiel #3
0
def test_normal_range_smoke():
    """
    Smoke test for ``HLL.cardinality()`` in the regime where neither the
    small nor the large range correction should be applied.

    Every register of an ``HLLType.FULL`` instance is set to the same
    mid-sized value; the reported cardinality must then equal the ceiling
    of the uncorrected estimator.
    """
    regwidth = 5
    log2m = 11

    # With regwidth = 5 the hash space is log2m + (2^5 - 1 - 1) bits wide,
    # i.e. L = log2m + 30.
    L = log2m + 30
    large_range_cutoff = (2**L) / 30

    m = BitUtil.left_shift_int(1, log2m)
    small_range_cutoff = 5 * m / 2

    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    # A 'medium' value for every register, picked so that no correction
    # is triggered.
    register_value = 7
    for register_index in range(m):
        raw_value = probabilistic_test_util.construct_hll_value(
            log2m, register_index, register_value)
        hll.add_raw(raw_value)

    observed = hll.cardinality()

    # When all registers hold the same value the estimator simplifies to
    # alpha / (m / 2^val).
    estimator = HLLUtil.alpha_m_squared(m) / (m / (2**register_value))

    # The estimator must sit between the two correction thresholds.
    assert estimator <= large_range_cutoff
    assert estimator > small_range_cutoff

    assert observed == ceil(estimator)
def new_hll(type):
    """
    Convenience wrapper around ``HLL.create_for_testing`` that fills in
    the module-level constants ``LOG2M``, ``REGWIDTH``,
    ``EXPLICIT_THRESHOLD`` and ``SPARSE_THRESHOLD``.

    :param type: the type of ``HLL`` to create; forwarded unchanged as the
                 last argument of ``HLL.create_for_testing``.
    :returns: a new ``HLL`` of the specified type, built with the
              constants listed above.
    """
    hll_type = type
    return HLL.create_for_testing(
        LOG2M, REGWIDTH, EXPLICIT_THRESHOLD, SPARSE_THRESHOLD, hll_type)
Beispiel #5
0
def run_one_test(tokens):
    """
    Estimates the cardinality of ``tokens`` once for every combination of
    ``log2m`` in ``[LOG2M_MIN, LOG2M_MAX]`` and register width in
    ``[REG_WIDTH_MIN, REG_WIDTH_MAX]``, then hands the collected results
    to ``plot``.

    Progress ("Trial i / n") and the current register width are printed
    for every combination.

    :param tokens: the values to count; iterated once per combination, so
                   it must support repeated iteration. Each token is
                   hashed with ``mmh3.hash`` before being added.
    """
    log2m_span = LOG2M_MAX - LOG2M_MIN + 1
    reg_width_span = REG_WIDTH_MAX - REG_WIDTH_MIN + 1
    num_trials = log2m_span * reg_width_span

    # Three parallel lists: entry k of each describes the k-th trial.
    log2m_list, reg_width_list, cardinality_list = [], [], []

    trial = 0
    for log2m in range(LOG2M_MIN, LOG2M_MAX + 1):
        for reg_width in range(REG_WIDTH_MIN, REG_WIDTH_MAX + 1):
            trial += 1
            print("Trial {} / {}".format(trial, num_trials))
            print(reg_width)

            hll = HLL(log2m, reg_width)
            for token in tokens:
                hll.add_raw(mmh3.hash(token))

            log2m_list.append(log2m)
            reg_width_list.append(reg_width)
            cardinality_list.append(hll.cardinality())

    plot(log2m_list, reg_width_list, cardinality_list)
def test_to_from_bytes():
    """
    Tests ``HLL.to_bytes()`` and ``HLL.from_bytes()`` for
    ``HLLType.EXPLICIT`` instances.

    For an empty, a partially filled and a completely filled explicit set,
    the serialized length must be the schema padding plus one 8-byte word
    per element, and deserializing must reproduce the original elements.
    """
    schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION
    hll_type = HLLType.EXPLICIT
    padding = schema_version.padding_bytes(hll_type)
    bytes_per_word = 8

    def check_round_trip(original, element_count):
        # Serialize, verify the length, then deserialize and compare.
        serialized = original.to_bytes(schema_version)
        assert len(serialized) == padding + bytes_per_word * element_count
        restored = HLL.from_bytes(serialized)
        assert_elements_equal(original, restored)

    # Empty set: nothing but padding is written.
    empty = new_hll(128)
    check_round_trip(empty, 0)

    # Partially filled set.
    partial = new_hll(128)
    for raw_value in range(3):
        partial.add_raw(raw_value)
    check_round_trip(partial, 3)

    # Full set: exactly ``explicit_threshold`` elements.
    explicit_threshold = 128
    full = new_hll(explicit_threshold)
    for offset in range(explicit_threshold):
        full.add_raw(27 + offset)
    check_round_trip(full, explicit_threshold)
def new_hll(explicit_threshold):
    """
    Creates an empty ``HLLType.EXPLICIT`` ``HLL`` with the given explicit
    threshold, using ``log2m = 11``, ``regwidth = 5`` and a sparse
    threshold of 256.

    :param explicit_threshold: explicit threshold to use for the constructed
           ``HLL``. This must be greater than zero.
    :type explicit_threshold: int
    :returns: whatever ``HLL.create_for_testing`` produces for these
              parameters; an empty ``HLLType.EXPLICIT`` instance.
    :rtype: HLL
    """
    log2m = 11
    regwidth = 5
    sparse_threshold = 256
    return HLL.create_for_testing(
        log2m, regwidth, explicit_threshold, sparse_threshold,
        HLLType.EXPLICIT)
Beispiel #8
0
def test_to_from_bytes():
    """
    Tests ``HLL.to_bytes()`` and ``HLL.from_bytes()`` for ``HLLType.FULL``
    instances.

    For an empty, a partially filled and a completely filled instance,
    the serialized length must be the schema padding plus the bytes
    needed for all ``2^log2m`` registers, and deserializing must
    reproduce the original register values.
    """
    regwidth = 5
    log2m = 11  # arbitrary

    schema_version = SerializationUtil.DEFAULT_SCHEMA_VERSION
    hll_type = HLLType.FULL
    padding = schema_version.padding_bytes(hll_type)
    # Register payload size for m = 2^log2m registers of ``regwidth`` bits.
    data_byte_count = probabilistic_test_util.get_required_bytes(
        regwidth, BitUtil.left_shift_int(1, log2m))
    expected_byte_count = padding + data_byte_count

    def make_full_hll():
        return HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)

    def check_round_trip(original):
        # The output length is the same no matter how many registers are set.
        serialized = original.to_bytes(schema_version)
        assert len(serialized) == expected_byte_count
        restored = HLL.from_bytes(serialized)
        assert_elements_equal(original, restored)

    # Empty instance.
    empty = make_full_hll()
    check_round_trip(empty)

    # Partially filled instance: registers 0..2 hold the values 9..11.
    partial = make_full_hll()
    for register_index in range(3):
        partial.add_raw(
            probabilistic_test_util.construct_hll_value(
                log2m, register_index, register_index + 9))
    check_round_trip(partial)

    # Completely filled instance: every register holds a value in 1..9.
    full = make_full_hll()
    for register_index in range(BitUtil.left_shift_int(1, log2m)):
        full.add_raw(
            probabilistic_test_util.construct_hll_value(
                log2m, register_index, (register_index % 9) + 1))
    check_round_trip(full)
Beispiel #9
0
def test_clear():
    """
    Tests ``HLL.clear()``.

    Every register of an ``HLLType.FULL`` instance is written directly
    through its ``_probabilistic_storage``; after ``clear()`` each
    register of that same storage object must read back as 0.
    """
    log2m = 4  # 16 registers per counter
    regwidth = 5
    register_count = BitUtil.left_shift_int(1, log2m)

    hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
    # Grabbed before clear() on purpose: the assertions below inspect this
    # very object afterwards.
    storage = hll._probabilistic_storage

    # Register k is given the value k.
    for register_index in range(register_count):
        storage.set_register(register_index, register_index)

    hll.clear()

    for register_index in range(register_count):
        # 0 is the default value of a register.
        assert storage.get_register(register_index) == 0
def test_promotion():
    """
    Tests promotion of an ``HLLType.EXPLICIT`` instance to
    ``HLLType.SPARSE`` and to ``HLLType.FULL``.

    In both scenarios one more distinct raw value is added than the
    explicit threshold, and the resulting type is checked.
    """
    # Scenario 1: test factory with a sparse threshold of 256; exceeding
    # the explicit threshold must yield a SPARSE instance.
    explicit_threshold = 128
    to_sparse = HLL.create_for_testing(
        11, 5, explicit_threshold, 256, HLLType.EXPLICIT)
    for raw_value in range(explicit_threshold + 1):
        to_sparse.add_raw(raw_value)
    assert to_sparse.get_type() == HLLType.SPARSE

    # Scenario 2: regular constructor with expthresh=4, which corresponds
    # to an explicit threshold of 8. The fourth argument (False) is
    # presumably the sparse-enabled flag -- confirm against the HLL
    # constructor. Nine values exceed the threshold, and the instance is
    # expected to end up FULL.
    to_full = HLL(11, 5, 4, False, HLLType.EXPLICIT)
    for raw_value in range(9):
        to_full.add_raw(raw_value)
    assert to_full.get_type() == HLLType.FULL
Beispiel #11
0
def test_register_value():
    """
    Tests the bounds on a register's value for a given raw input value.

    The test is table-driven: each entry is ``(raw_value, j, expected)``.
    The raw value is added to the ``HLL`` and register ``j`` must then
    hold ``expected``. Entries are processed strictly in order, each
    against the state left behind by the previous ones.
    """
    # Small enough to make testing easy: in every raw value below, the
    # lowest hex digit is the register index 'j'.
    log2m = 4

    def check_cases(regwidth, cases):
        hll = HLL.create_for_testing(log2m, regwidth, 128, 256, HLLType.FULL)
        bit_vector = hll._probabilistic_storage
        for raw_value, j, expected in cases:
            hll.add_raw(raw_value)
            assert bit_vector.get_register(j) == expected

    # ------------------------------------------------------------
    # register width 4 (the minimum size)
    check_cases(4, [
        # lower-bounds of the register
        (0x0000000000000001, 1, 0),
        (0x0000000000000012, 2, 1),
        (0x0000000000000023, 3, 2),
        (0x0000000000000044, 4, 3),
        (0x0000000000000085, 5, 4),
        # upper-bounds of the register
        # NOTE:  bear in mind that BitVector itself does ensure that
        #        overflow of a register is prevented
        (0x0000000000010006, 6, 13),
        (0x0000000000020007, 7, 14),
        (0x0000000000040008, 8, 15),
        (0x0000000000080009, 9, 15),  # overflow
        # sanity checks to ensure that no other bits above the lowest-set
        # bit matters
        # NOTE:  same expected value as case 'j = 6' above
        (0x000000000003000A, 10, 13),
        (0x000000000011000B, 11, 13),
    ])

    # ------------------------------------------------------------
    # register width 5
    check_cases(5, [
        # lower-bounds of the register
        (0x0000000000000001, 1, 0),
        (0x0000000000000012, 2, 1),
        (0x0000000000000023, 3, 2),
        (0x0000000000000044, 4, 3),
        (0x0000000000000085, 5, 4),
        # upper-bounds of the register
        # NOTE:  bear in mind that BitVector itself does ensure that
        #        overflow of a register is prevented
        (0x0000000100000006, 6, 29),
        (0x0000000200000007, 7, 30),
        (0x0000000400000008, 8, 31),
        (0x0000000800000009, 9, 31),  # overflow
    ])
Beispiel #12
0
def assert_cardinality(hll_type, items, fastonly):
    """
    Checks that serialization round trips and deep copies preserve an
    ``HLL`` across a grid of constructor parameters.

    For each combination of ``log2m``, register width, ``expthresh`` and
    sparse flag, an ``HLL`` of ``hll_type`` is filled with ``items``.
    Both ``HLL.from_bytes(hll.to_bytes())`` and ``deepcopy(hll)`` must
    then agree with the original on cardinality, type and serialized
    bytes. One ``.`` is written to stdout per combination.

    :param hll_type: type passed as the last ``HLL`` constructor argument.
    :param items: raw values added to each ``HLL`` via ``add_raw``;
                  iterated once per combination.
    :param fastonly: if true, only the minimum and maximum of each
                     parameter range are tested instead of every value.
    """
    # NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
    #       completes in a reasonable amount of time. Not much is gained by
    #       testing larger values - there are no more known serialization
    #       related edge cases that appear as log2m gets even larger.
    max_log2m = 16

    if fastonly:
        # Only the two end points of every range.
        log2m_values = (HLL.MINIMUM_LOG2M_PARAM, max_log2m)
        regw_values = (HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM)
        expthr_values = (HLL.MINIMUM_EXPTHRESH_PARAM,
                         HLL.MAXIMUM_EXPTHRESH_PARAM)
    else:
        log2m_values = range(HLL.MINIMUM_LOG2M_PARAM, max_log2m + 1)
        regw_values = range(HLL.MINIMUM_REGWIDTH_PARAM,
                            HLL.MAXIMUM_REGWIDTH_PARAM + 1)
        expthr_values = range(HLL.MINIMUM_EXPTHRESH_PARAM,
                              HLL.MAXIMUM_EXPTHRESH_PARAM + 1)

    def assert_same(candidate, reference):
        # Cardinality, type and serialized form must all match.
        assert candidate.cardinality() == reference.cardinality()
        assert candidate.get_type() == reference.get_type()
        assert candidate.to_bytes() == reference.to_bytes()

    for log2m in log2m_values:
        for regw in regw_values:
            for expthr in expthr_values:
                for sparse in (True, False):
                    hll = HLL(log2m, regw, expthr, sparse, hll_type)
                    for item in items:
                        hll.add_raw(item)

                    # Serialization round trip.
                    assert_same(HLL.from_bytes(hll.to_bytes()), hll)

                    # Deep copy.
                    assert_same(deepcopy(hll), hll)

                    # Progress indicator.
                    sys.stdout.write('.')
                    sys.stdout.flush()
Beispiel #13
0
import sys
import argparse
from python_hll.hll import HLL
import mmh3

# Command-line script: builds an HLL from the given parameters, adds the
# mmh3 hash of every line of the input file, and prints the estimated
# cardinality.
#
# Positional arguments:
#   F  path of the input file (read as text)
#   L  log2m, passed as the first HLL constructor argument
#   R  register width, passed as the second HLL constructor argument
parser = argparse.ArgumentParser(
    description='Estimate unique words in text file(s)')
parser.add_argument('file', metavar='F', type=str, help="csv")
parser.add_argument('log2m', metavar='L', type=int)
parser.add_argument('reg_width', metavar='R', type=int)
a = parser.parse_args()

hll = HLL(a.log2m, a.reg_width)
with open(a.file, 'r') as f:
    # Iterating the file yields one line at a time, line terminator
    # included, so each *line* is treated as a single token.
    # NOTE(review): the description mentions "words" and the help text
    # "csv", but no splitting or stripping is done here -- confirm the
    # intended input format.
    for token in f:
        hashed_value = mmh3.hash(token)
        hll.add_raw(hashed_value)
print(hll.cardinality())
def string_to_hll(s):
    """
    Builds an ``HLL`` from its hex string representation.

    The first two characters of ``s`` (the ``\\x`` prefix) are dropped;
    the rest is decoded with ``NumberUtil.from_hex`` and the result is
    passed to ``HLL.from_bytes``.

    :param s: hex string including its two-character prefix.
    :returns: the ``HLL`` produced by ``HLL.from_bytes``.
    """
    hex_digits = s[2:]
    raw_bytes = NumberUtil.from_hex(hex_digits, 0, len(hex_digits))
    return HLL.from_bytes(raw_bytes)