def test_hyperloglog_small_card_est(self):
     # Build a sketch from 16 registers that all hold 1, with
     # HyperLogLog._linearcounting swapped for a mock, and check that the
     # mock was invoked by the time count() has returned.
     registers = np.ones(1 << 4, dtype=np.int8)
     with patch.object(HyperLogLog, '_linearcounting') as linear_counting:
         linear_counting.return_value = 0
         HyperLogLog(reg=registers).count()
     self.assertTrue(linear_counting.called)
 def test_hyperloglog_small_card_est(self):
     # Build a sketch from 16 registers that all hold 1, with
     # HyperLogLog._linearcounting swapped for a mock, and check that the
     # mock was invoked by the time count() has returned.
     registers = np.ones(1 << 4, dtype=np.int8)
     with patch.object(HyperLogLog, '_linearcounting') as linear_counting:
         linear_counting.return_value = 0
         HyperLogLog(reg=registers).count()
     self.assertTrue(linear_counting.called)
 def test_hyperloglog_large_card_est(self):
     # Build a sketch from 16 registers that all hold 27, with
     # HyperLogLog._largerange_correction swapped for a mock, and check that
     # the mock was invoked by the time count() has returned.
     registers = np.full(1 << 4, 27, dtype=np.int8)
     with patch.object(HyperLogLog, '_largerange_correction') as correction:
         correction.return_value = 0
         HyperLogLog(reg=registers).count()
     self.assertTrue(correction.called)
 def test_hyperloglog_large_card_est(self):
     # Build a sketch from 16 registers that all hold 27, with
     # HyperLogLog._largerange_correction swapped for a mock, and check that
     # the mock was invoked by the time count() has returned.
     registers = np.full(1 << 4, 27, dtype=np.int8)
     with patch.object(HyperLogLog, '_largerange_correction') as correction:
         correction.return_value = 0
         HyperLogLog(reg=registers).count()
     self.assertTrue(correction.called)
Beispiel #5
0
 def test_count(self):
     # Smoke test only: the estimate is not asserted here, this just makes
     # sure that digesting a few hashes and calling count() does not raise.
     # See benchmarks for the accuracy of the cardinality estimation.
     sketch = HyperLogLog(4)
     for hash_value in (0b00011111, 0xfffffff1, 0xfffffff5):
         sketch.digest(FakeHash(hash_value))
     sketch.count()
Beispiel #6
0
 def test_count(self):
     # Smoke test only: the estimate is not asserted here, this just makes
     # sure that digesting a few hashes and calling count() does not raise.
     # See benchmarks for the accuracy of the cardinality estimation.
     sketch = HyperLogLog(4)
     for hash_value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
         sketch.digest(FakeHash(hash_value))
     sketch.count()
Beispiel #7
0
    def test_union_count(self):
        shared_hashes = (0b00011111, 0xfffffff1, 0xfffffff5)

        left = HyperLogLog(4)
        for hash_value in shared_hashes:
            left.digest(FakeHash(hash_value))

        # A sketch that has digested nothing must not change the estimate.
        right = HyperLogLog(4)
        self.assertEqual(left.count(), left.union_count(right))

        # Digesting exactly the same hashes must not change it either.
        for hash_value in shared_hashes:
            right.digest(FakeHash(hash_value))
        self.assertEqual(left.count(), left.union_count(right))

        # One hash that only the other sketch has seen must change it.
        right.digest(FakeHash(0xfffffff6))
        self.assertNotEqual(left.count(), left.union_count(right))
Beispiel #8
0
    def test_union_count(self):
        shared_hashes = (0b0001111, 0xfffffffffffffff1, 0xfffffff5)

        left = HyperLogLog(4)
        for hash_value in shared_hashes:
            left.digest(FakeHash(hash_value))

        # A sketch that has digested nothing must not change the estimate.
        right = HyperLogLog(4)
        self.assertEqual(left.count(), left.union_count(right))

        # Digesting exactly the same hashes must not change it either.
        for hash_value in shared_hashes:
            right.digest(FakeHash(hash_value))
        self.assertEqual(left.count(), left.union_count(right))

        # One hash that only the other sketch has seen must change it.
        right.digest(FakeHash(0xfffffff6))
        self.assertNotEqual(left.count(), left.union_count(right))
def eg1():
    """Print the HyperLogLog estimate and the exact distinct count of data1.

    Each item of the module-level ``data1`` is UTF-8 encoded, hashed with
    ``sha1`` and digested into a default-constructed ``HyperLogLog``.
    """
    sketch = HyperLogLog()
    for item in data1:
        digest = sha1(item.encode('utf8'))
        sketch.digest(digest)
    print("Estimated cardinality is", sketch.count())

    distinct_items = set(data1)
    print("Actual cardinality is", len(distinct_items))
Beispiel #10
0
def eg1():
    """Print the HyperLogLog estimate and the exact distinct count of data1.

    Each item of the module-level ``data1`` is UTF-8 encoded, hashed with
    ``sha1`` and digested into a default-constructed ``HyperLogLog``.
    """
    sketch = HyperLogLog()
    for item in data1:
        digest = sha1(item.encode('utf8'))
        sketch.digest(digest)
    print("Estimated cardinality is", sketch.count())

    distinct_items = set(data1)
    print("Actual cardinality is", len(distinct_items))
def run_acc(size, seed, p):
    """Return the relative error of a HyperLogLog estimate on random input.

    The module-level ``random`` generator is seeded with ``seed``; ``size``
    integers are then drawn from ``[1, size]``, converted with ``int_bytes``
    and fed both to a ``HyperLogLog(p=p)`` and to an exact ``set``.

    Args:
        size: Number of draws, and also the upper bound of each drawn integer.
        seed: Seed passed to ``random.seed``.
        p: Value forwarded to ``HyperLogLog`` as its ``p`` argument.

    Returns:
        ``abs(exact - estimate) / exact`` where ``exact`` is the size of the
        set of drawn values and ``estimate`` is ``h.count()``.

    Raises:
        ZeroDivisionError: If no value was drawn (``size`` < 1), since the
            exact count used as the divisor is then zero.
    """
    # Hand p to logging as an argument so the message is only formatted when
    # the record is actually emitted.
    logging.info("HyperLogLog using p = %d ", p)
    h = HyperLogLog(p=p)
    s = set()
    random.seed(seed)
    for _ in range(size):
        v = int_bytes(random.randint(1, size))
        h.update(v)
        s.add(v)
    exact = float(len(s))
    return abs(exact - h.count()) / exact
def _run_hyperloglog(data, seed, p):
    """Return the HyperLogLog count after updating it with hashed ``data``.

    Every item is hashed by ``pyhash.murmur3_32()`` with the given ``seed``
    and the result is passed to ``update`` on a sketch constructed with
    ``p=p`` and ``hashobj=Hash``.
    """
    murmur = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p, hashobj=Hash)
    for item in data:
        hashed = murmur(item, seed=seed)
        sketch.update(hashed)
    return sketch.count()
def _run_hyperloglog(data, seed, p):
    """Return the HyperLogLog count after digesting hashed ``data``.

    Every item is hashed by ``pyhash.murmur3_32()`` with the given ``seed``,
    wrapped in ``Hash`` and passed to ``digest`` on a sketch constructed
    with ``p=p``.
    """
    murmur = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p)
    for item in data:
        wrapped = Hash(murmur(item, seed=seed))
        sketch.digest(wrapped)
    return sketch.count()