def test_hyperloglog_small_card_est(self):
    """Check that count() calls _linearcounting when every register is 1.

    The sketch is built directly from 16 (1 << 4) int8 registers, and
    ``HyperLogLog._linearcounting`` is replaced by a mock returning 0 so
    only the fact that it was invoked is verified.
    """
    register_count = 1 << 4
    registers = np.array([1] * register_count, dtype=np.int8)
    with patch.object(HyperLogLog, '_linearcounting',
                      return_value=0) as linear_counting:
        sketch = HyperLogLog(reg=registers)
        sketch.count()
    # The mock keeps its call record after the patch is undone.
    self.assertTrue(linear_counting.called)
def test_hyperloglog_large_card_est(self):
    """Check that count() calls _largerange_correction when every register is 27.

    The sketch is built directly from 16 (1 << 4) int8 registers, and
    ``HyperLogLog._largerange_correction`` is replaced by a mock returning
    0 so only the fact that it was invoked is verified.
    """
    register_count = 1 << 4
    registers = np.array([27] * register_count, dtype=np.int8)
    with patch.object(HyperLogLog, '_largerange_correction',
                      return_value=0) as large_range:
        sketch = HyperLogLog(reg=registers)
        sketch.count()
    # The mock keeps its call record after the patch is undone.
    self.assertTrue(large_range.called)
def test_count(self):
    """Smoke-test count() after digesting three fixed fake hashes."""
    sketch = HyperLogLog(4)
    for hash_value in (0b00011111, 0xfffffff1, 0xfffffff5):
        sketch.digest(FakeHash(hash_value))
    # The estimate itself is not checked here; this only makes sure the
    # call runs without error. Accuracy of the cardinality estimation is
    # covered by the benchmarks.
    sketch.count()
def test_count(self):
    """Smoke-test count() after digesting three fixed fake hashes."""
    sketch = HyperLogLog(4)
    for hash_value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        sketch.digest(FakeHash(hash_value))
    # The estimate itself is not checked here; this only makes sure the
    # call runs without error. Accuracy of the cardinality estimation is
    # covered by the benchmarks.
    sketch.count()
def test_union_count(self):
    """Compare union_count() against count() as the second sketch fills up.

    The union estimate must equal ``h1.count()`` while ``h2`` is empty or
    holds the same fake hashes as ``h1``, and must differ once ``h2``
    digests one additional value.
    """
    shared_values = (0b00011111, 0xfffffff1, 0xfffffff5)

    first = HyperLogLog(4)
    for hash_value in shared_values:
        first.digest(FakeHash(hash_value))

    # Union with an empty sketch.
    second = HyperLogLog(4)
    self.assertEqual(first.count(), first.union_count(second))

    # Union with a sketch that digested the same values.
    for hash_value in shared_values:
        second.digest(FakeHash(hash_value))
    self.assertEqual(first.count(), first.union_count(second))

    # One extra value in the second sketch must change the union estimate.
    second.digest(FakeHash(0xfffffff6))
    self.assertNotEqual(first.count(), first.union_count(second))
def test_union_count(self):
    """Compare union_count() against count() as the second sketch fills up.

    The union estimate must equal ``h1.count()`` while ``h2`` is empty or
    holds the same fake hashes as ``h1``, and must differ once ``h2``
    digests one additional value.
    """
    shared_values = (0b0001111, 0xfffffffffffffff1, 0xfffffff5)

    first = HyperLogLog(4)
    for hash_value in shared_values:
        first.digest(FakeHash(hash_value))

    # Union with an empty sketch.
    second = HyperLogLog(4)
    self.assertEqual(first.count(), first.union_count(second))

    # Union with a sketch that digested the same values.
    for hash_value in shared_values:
        second.digest(FakeHash(hash_value))
    self.assertEqual(first.count(), first.union_count(second))

    # One extra value in the second sketch must change the union estimate.
    second.digest(FakeHash(0xfffffff6))
    self.assertNotEqual(first.count(), first.union_count(second))
def eg1():
    """Print the estimated and the exact number of distinct items in data1.

    Each item of ``data1`` is UTF-8 encoded, hashed with SHA-1 and fed to
    a default-constructed HyperLogLog; the exact figure is the size of
    ``set(data1)``.
    """
    sketch = HyperLogLog()
    for item in data1:
        encoded = item.encode('utf8')
        sketch.digest(sha1(encoded))
    estimate = sketch.count()
    print("Estimated cardinality is", estimate)
    print("Actual cardinality is", len(set(data1)))
def run_acc(size, seed, p):
    """Return the relative error of a HyperLogLog estimate on random data.

    ``size`` random integers are drawn from ``[1, size]`` using ``seed``,
    converted with ``int_bytes`` and added both to a ``HyperLogLog(p=p)``
    and to an exact set. The result is
    ``abs(exact - estimate) / exact``.

    Raises:
        ZeroDivisionError: if no values are generated (``size`` < 1),
            because the exact cardinality is then zero.
    """
    # Pass p as a logging argument so formatting is deferred to the
    # logging framework instead of being done eagerly with ``%``.
    logging.info("HyperLogLog using p = %d ", p)
    h = HyperLogLog(p=p)
    s = set()
    random.seed(seed)
    for _ in range(size):
        v = int_bytes(random.randint(1, size))
        h.update(v)
        s.add(v)
    exact = float(len(s))
    return abs(exact - h.count()) / exact
def _run_hyperloglog(data, seed, p):
    """Return the HyperLogLog count of ``data`` hashed with murmur3_32.

    Every item is hashed by ``pyhash.murmur3_32`` with the given ``seed``
    and the resulting value is passed to ``update`` on a sketch created
    with ``p=p`` and ``hashobj=Hash``.
    """
    murmur = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p, hashobj=Hash)
    for item in data:
        hashed = murmur(item, seed=seed)
        sketch.update(hashed)
    return sketch.count()
def _run_hyperloglog(data, seed, p):
    """Return the HyperLogLog count of ``data`` hashed with murmur3_32.

    Every item is hashed by ``pyhash.murmur3_32`` with the given ``seed``,
    wrapped in ``Hash`` and passed to ``digest`` on a sketch created with
    ``p=p``.
    """
    murmur = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p)
    for item in data:
        hashed = murmur(item, seed=seed)
        sketch.digest(Hash(hashed))
    return sketch.count()