Esempio n. 1
0
class TestCardinalityEstimation(unittest.TestCase):
    """Cardinality checks against hand-set HyperLogLog registers.

    Each test writes registers directly through ``set_register`` and then
    asserts that ``cardinality()`` lands inside a narrow expected interval.
    """

    def setUp(self):
        # A fresh sketch for every test, constructed as HyperLogLog(5).
        self.hll = HyperLogLog(5)

    def test_small_range_correction_all_registers_set_to_zero(self):
        """A sketch with no register writes reports exactly 0.0."""
        self.assertEqual(self.hll.cardinality(), 0.0)

    def test_small_range_correction_not_all_registers_set_to_zero(self):
        """Register 0 set to 1 gives an estimate close to 1.4657."""
        self.hll.set_register(0, 1)
        estimate = self.hll.cardinality()
        self.assertTrue(1.46571806761 <= estimate <= 1.46571806762)

    def test_medium_range_no_correction(self):
        """Registers 0..31 all set to 2 give an estimate near 89.216."""
        for index in range(32):
            self.hll.set_register(index, 2)

        estimate = self.hll.cardinality()
        self.assertTrue(89.216 <= estimate <= 89.217)

    @unittest.skip("correction value needs to be re-computed")
    def test_large_range_correction(self):
        """Expected interval for a HyperLogLog(16) sketch (currently skipped)."""
        hll = HyperLogLog(16)
        # Registers at indices 0 .. size() - 2 are written; the final index
        # is left untouched.
        stop = hll.size() - 1
        for index in range(stop):
            hll.set_register(index, 16)

        estimate = hll.cardinality()
        self.assertTrue(7916284520 <= estimate <= 7916284521)
Esempio n. 2
0
class TestCardinalityEstimation(unittest.TestCase):
    """Interval-based assertions on HyperLogLog cardinality estimates.

    Registers are set by hand with ``set_register`` so that the value
    returned by ``cardinality()`` can be compared with fixed bounds.
    """

    def setUp(self):
        # Every test starts from a new HyperLogLog(5) instance.
        self.hll = HyperLogLog(5)

    def test_small_range_correction_all_registers_set_to_zero(self):
        """With no registers written the estimate equals 0.0."""
        self.assertEqual(self.hll.cardinality(), 0.0)

    def test_small_range_correction_not_all_registers_set_to_zero(self):
        """A single write (register 0 -> 1) keeps the estimate near 1.4657."""
        lower, upper = 1.46571806761, 1.46571806762
        self.hll.set_register(0, 1)
        value = self.hll.cardinality()
        self.assertTrue(lower <= value <= upper)

    def test_medium_range_no_correction(self):
        """Writing 2 into registers 0..31 gives an estimate near 89.216."""
        lower, upper = 89.216, 89.217
        for slot in range(32):
            self.hll.set_register(slot, 2)

        value = self.hll.cardinality()
        self.assertTrue(lower <= value <= upper)

    @unittest.skip("correction value needs to be re-computed")
    def test_large_range_correction(self):
        """Bounds check for a HyperLogLog(16) sketch; skipped for now."""
        lower, upper = 7916284520, 7916284521
        hll = HyperLogLog(16)
        # The loop stops one short of size(), so the last index is not set.
        for slot in range(hll.size() - 1):
            hll.set_register(slot, 16)

        value = hll.cardinality()
        self.assertTrue(lower <= value <= upper)
Esempio n. 3
0
    def test_large_range_correction(self):
        """Check the estimate of a HyperLogLog(16) sketch against fixed bounds.

        Registers at indices 0 .. size() - 2 are set to 16 (the last index is
        not written) before the cardinality is read.
        """
        hll = HyperLogLog(16)
        stop = hll.size() - 1
        for index in range(stop):
            hll.set_register(index, 16)

        estimate = hll.cardinality()
        self.assertTrue(7916284520 <= estimate <= 7916284521)
Esempio n. 4
0
    def test_large_range_correction(self):
        """Assert the HyperLogLog(16) estimate lies in [7916284520, 7916284521].

        Every register index below ``size() - 1`` is set to 16 first; the
        final index is left as constructed.
        """
        lower, upper = 7916284520, 7916284521
        hll = HyperLogLog(16)
        for slot in range(hll.size() - 1):
            hll.set_register(slot, 16)

        value = hll.cardinality()
        self.assertTrue(lower <= value <= upper)
Esempio n. 5
0
class ProbabilisticCounter(object):
    """Approximate distinct-value counter backed by a HyperLogLog sketch.

    The sketch precision p is derived from the requested error rate using
    error_rate = 1.04 / sqrt(m), where m = 2 ** p is the register count.
    """

    def __init__(self, error_rate=0.005):
        """Create the underlying sketch for the requested error rate.

        Args:
            error_rate: target relative error as a fraction (0.01 is 1%,
                0.005 is 0.5%). The original note gives 0.005 as the
                minimum; this constructor does not check the value.
        """
        self.error_rate = error_rate

        # Solve error_rate = 1.04 / sqrt(m) for m, then take p as log2(m)
        # rounded up to an integer.
        registers_needed = (1.04 / error_rate) ** 2
        precision = int(math.ceil(math.log(registers_needed, 2)))
        self.hll = HyperLogLog(precision)

    def add(self, value):
        """Insert ``value`` into the sketch and return the sketch's result.

        Per the original note: True when the value is new, False when it
        was already included.
        """
        outcome = self.hll.add(value)
        return outcome

    def count(self):
        """Return the sketch's cardinality estimate rounded down."""
        estimate = self.hll.cardinality()
        return math.floor(estimate)
Esempio n. 6
0
from HLL import HyperLogLog
from generate_rand import gen

# Sketch constructed as HyperLogLog(5), i.e. 2^5 registers.
hll = HyperLogLog(5)

# Echo each value produced by gen(); the loop does not touch the sketch.
# NOTE(review): possibly the loop was meant to feed hll.add(...) - confirm.
for item in gen():
    print(item)

# The only element inserted into the sketch is this literal string.
hll.add('some data')

# Cardinality estimate read back from the sketch.
estimate = hll.cardinality()