class TestCardinalityEstimation(unittest.TestCase):
    """Check HyperLogLog.cardinality() against hand-set register states."""

    def setUp(self):
        # Fresh estimator for every test. The tests below address registers
        # 0..31, so HyperLogLog(5) presumably allocates 2 ** 5 = 32 registers
        # (TODO confirm against the HyperLogLog implementation).
        self.hll = HyperLogLog(5)

    def _assert_within(self, value, low, high):
        """Assert ``low <= value <= high``.

        Uses the dedicated comparison assertions instead of
        ``assertTrue(<bool>)`` so that a failure reports the offending value
        rather than just "False is not true".
        """
        self.assertGreaterEqual(value, low)
        self.assertLessEqual(value, high)

    def test_small_range_correction_all_registers_set_to_zero(self):
        """A freshly constructed estimator reports exactly 0.0."""
        self.assertEqual(self.hll.cardinality(), 0.0)

    def test_small_range_correction_not_all_registers_set_to_zero(self):
        """With only register 0 set to 1 the estimate is about 1.4657."""
        self.hll.set_register(0, 1)
        self._assert_within(
            self.hll.cardinality(), 1.46571806761, 1.46571806762
        )

    def test_medium_range_no_correction(self):
        """With registers 0..31 all set to 2 the estimate is about 89.216."""
        for i in range(32):
            self.hll.set_register(i, 2)
        self._assert_within(self.hll.cardinality(), 89.216, 89.217)

    @unittest.skip("correction value needs to be re-computed")
    def test_large_range_correction(self):
        """Expect an estimate in [7916284520, 7916284521] for HyperLogLog(16).

        Currently skipped: the expected value needs to be re-computed.
        """
        hll = HyperLogLog(16)
        # Every register except the last one is set to 16.
        for i in range(hll.size() - 1):
            hll.set_register(i, 16)
        self._assert_within(hll.cardinality(), 7916284520, 7916284521)
class TestCardinalityEstimation(unittest.TestCase):
    """Check HyperLogLog.cardinality() against hand-set register states."""

    def setUp(self):
        # Fresh estimator for every test. The tests below address registers
        # 0..31, so HyperLogLog(5) presumably allocates 2 ** 5 = 32 registers
        # (TODO confirm against the HyperLogLog implementation).
        self.hll = HyperLogLog(5)

    def _assert_within(self, value, low, high):
        """Assert ``low <= value <= high``.

        Uses the dedicated comparison assertions instead of
        ``assertTrue(<bool>)`` so that a failure reports the offending value
        rather than just "False is not true".
        """
        self.assertGreaterEqual(value, low)
        self.assertLessEqual(value, high)

    def test_small_range_correction_all_registers_set_to_zero(self):
        """A freshly constructed estimator reports exactly 0.0."""
        self.assertEqual(self.hll.cardinality(), 0.0)

    def test_small_range_correction_not_all_registers_set_to_zero(self):
        """With only register 0 set to 1 the estimate is about 1.4657."""
        self.hll.set_register(0, 1)
        self._assert_within(
            self.hll.cardinality(), 1.46571806761, 1.46571806762
        )

    def test_medium_range_no_correction(self):
        """With registers 0..31 all set to 2 the estimate is about 89.216."""
        for i in range(32):
            self.hll.set_register(i, 2)
        self._assert_within(self.hll.cardinality(), 89.216, 89.217)

    @unittest.skip("correction value needs to be re-computed")
    def test_large_range_correction(self):
        """Expect an estimate in [7916284520, 7916284521] for HyperLogLog(16).

        Currently skipped: the expected value needs to be re-computed.
        """
        hll = HyperLogLog(16)
        # Every register except the last one is set to 16.
        for i in range(hll.size() - 1):
            hll.set_register(i, 16)
        self._assert_within(hll.cardinality(), 7916284520, 7916284521)
def test_large_range_correction(self):
    """Expect an estimate in [7916284520, 7916284521] for HyperLogLog(16).

    ``self`` must provide the unittest assertion methods
    ``assertGreaterEqual`` and ``assertLessEqual``.
    """
    hll = HyperLogLog(16)
    # Every register except the last one is set to 16.
    for i in range(hll.size() - 1):
        hll.set_register(i, 16)
    c = hll.cardinality()
    # Dedicated comparison assertions report the actual estimate on failure,
    # unlike assertTrue on a pre-computed boolean.
    self.assertGreaterEqual(c, 7916284520)
    self.assertLessEqual(c, 7916284521)
class ProbabilisticCounter(object):
    """Approximate counter that delegates to a HyperLogLog instance.

    The number of registers is derived from the requested error rate:

        error_rate = 1.04 / sqrt(m)
        m = 2 ** p -> registers count
        M(1)... M(m) = 0 -> registers
    """

    # error_rate: 1% = 0.01, 0.5% = 0.005 (min 0.005)
    def __init__(self, error_rate=0.005):
        """Create a counter sized for the given relative error rate.

        Args:
            error_rate: target error as a fraction (1% = 0.01). The original
                note gives 0.005 as the minimum; smaller values are passed on
                unchanged and are presumably rejected by HyperLogLog itself
                (TODO confirm).

        Raises:
            ValueError: if ``error_rate`` is not strictly positive. Without
                this check 0 fails with an unrelated ZeroDivisionError and a
                negative rate is silently accepted, because the ratio is
                squared before the logarithm is taken.
        """
        # ``not (x > 0)`` also rejects NaN, which compares false to everything.
        if not error_rate > 0:
            raise ValueError(
                "error_rate must be a positive number, got {!r}".format(
                    error_rate
                )
            )
        self.error_rate = error_rate
        # Solve error_rate = 1.04 / sqrt(2 ** p) for p and round up so the
        # achieved error is no worse than requested.
        p = int(math.ceil(math.log((1.04 / error_rate) ** 2, 2)))
        self.hll = HyperLogLog(p)

    def add(self, value):
        """Add ``value`` and return the result of ``HyperLogLog.add``.

        According to the original note the result is True when the value is
        new and False when it was already included; this depends on the
        underlying HyperLogLog implementation.
        """
        return self.hll.add(value)

    def count(self):
        """Return the cardinality estimate rounded down with math.floor."""
        return math.floor(self.hll.cardinality())
# Small usage example: build a HyperLogLog, feed it a value and read back the
# cardinality estimate.
from HLL import HyperLogLog
from generate_rand import gen

hll = HyperLogLog(5) # use 2^5 registers
# Print every item produced by gen(); the items themselves are not added to
# the estimator here.
for i in gen(): print(i)
# NOTE(review): the same constant string is added no matter what gen()
# yields -- presumably the intent was to add each generated item (and possibly
# to do so inside the loop above); confirm before relying on `estimate`.
hll.add('some data')
# Cardinality estimate reported by the HyperLogLog after the add above.
estimate = hll.cardinality()