def eg1():
    """Estimate the cardinality of data1 and compare with the exact count."""
    sketch = HyperLogLog()
    for item in data1:
        sketch.digest(sha1(item.encode('utf8')))
    print("Estimated cardinality is", sketch.count())
    exact = set(data1)
    print("Actual cardinality is", len(exact))
def test_merge(self):
    """Merging must keep the element-wise maximum of both register arrays."""
    left = HyperLogLog(4)
    right = HyperLogLog(4)
    left.digest(FakeHash(0b00011111))
    right.digest(FakeHash(0xfffffff1))
    left.merge(right)
    # Register 0b1111 was set by `left`; register 1 comes from `right`.
    self.assertEqual(left.reg[0b1111], 32 - 4)
    self.assertEqual(left.reg[1], 1)
def test_merge(self):
    """Merging must keep the element-wise maximum of both register arrays."""
    left = HyperLogLog(4)
    right = HyperLogLog(4)
    left.digest(FakeHash(0b0001111))
    right.digest(FakeHash(0xfffffffffffffff1))
    left.merge(right)
    # 64-bit hash width: remaining 60 bits of 0b0001111 are all zero,
    # so the register gets the maximum rank 64 - 4 + 1.
    self.assertEqual(left.reg[0b1111], 64 - 4 + 1)
    self.assertEqual(left.reg[1], 1)
def run_perf(card, p):
    """Digest `card` hashed integers into a HyperLogLog with precision `p`.

    Returns the elapsed time in seconds.

    Fix: the original used time.clock(), which was deprecated in Python 3.3
    and removed in Python 3.8. time.perf_counter() is the documented
    replacement for measuring short durations.
    """
    h = HyperLogLog(p=p)
    logging.info("HyperLogLog using p = %d " % p)
    start = time.perf_counter()
    for i in range(card):
        h.digest(sha1(int_bytes(i)))
    duration = time.perf_counter() - start
    logging.info("Digested %d hashes in %.4f sec" % (card, duration))
    return duration
def test_count(self):
    """Smoke test: count() must run without raising after some digests."""
    sketch = HyperLogLog(4)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        sketch.digest(FakeHash(value))
    # We can't really verify the correctness here, just to make sure
    # no syntax error
    # See benchmarks for the accuracy of the cardinality estimation.
    sketch.count()
def _run_hyperloglog(A, B, data, seed, p):
    """Build two HyperLogLog sketches over the index ranges A and B of
    `data` and return the inclusion estimate between them.

    A, B: (start, end) index pairs into `data`.
    seed: seed passed to the murmur3 hasher.
    p: HyperLogLog precision.

    Fix: the original used `xrange`, which does not exist in Python 3;
    `range` iterates identically (and is also valid in Python 2).
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p)
    h2 = HyperLogLog(p=p)
    for i in range(a_start, a_end):
        h1.digest(Hash(hasher(data[i], seed=seed)))
    for i in range(b_start, b_end):
        h2.digest(Hash(hasher(data[i], seed=seed)))
    return _hyperloglog_inclusion(h1, h2)
def run_acc(size, seed, p):
    """Digest `size` random integers and return the relative error of the
    HyperLogLog estimate against the exact set cardinality."""
    logging.info("HyperLogLog using p = %d " % p)
    sketch = HyperLogLog(p=p)
    seen = set()
    random.seed(seed)
    for _ in range(size):
        value = int_bytes(random.randint(1, size))
        sketch.digest(sha1(value))
        seen.add(value)
    # Relative error: |exact - estimate| / exact.
    return abs(float(len(seen)) - sketch.count()) / float(len(seen))
def eg2():
    """Estimate the union cardinality of data1 and data2 and compare it
    with the exact union size."""
    sketch1 = HyperLogLog()
    sketch2 = HyperLogLog()
    for item in data1:
        sketch1.digest(sha1(item.encode('utf8')))
    for item in data2:
        sketch2.digest(sha1(item.encode('utf8')))
    merged = HyperLogLog.union(sketch1, sketch2)
    print("Estimated union cardinality is", merged.count())
    exact_union = set(data1).union(set(data2))
    print("Actual union cardinality is", len(exact_union))
def test_jaccard(self):
    """Jaccard is 0 vs. an empty sketch, ~1 for identical inputs, and
    drifts away from 1 once the inputs diverge."""
    first = HyperLogLog(4)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.jaccard(second), 0)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.jaccard(second)), 1)
    second.digest(FakeHash(0xfffffff6))
    self.assertNotEqual(first.jaccard(second), 1)
def test_inclusion(self):
    """Inclusion is 0 vs. an empty sketch and stays ~1 while the other
    sketch is a superset."""
    first = HyperLogLog(4)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.inclusion(second), 0)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.inclusion(second)), 1)
    # Adding an extra element to `second` keeps `first` fully included.
    second.digest(FakeHash(0xfffffff6))
    self.assertEqual(int(first.inclusion(second)), 1)
def test_union_count(self):
    """Union count equals count() for empty/identical peers and changes
    once the peer contains a new element."""
    first = HyperLogLog(4)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.count(), first.union_count(second))
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(first.count(), first.union_count(second))
    second.digest(FakeHash(0xfffffff6))
    self.assertNotEqual(first.count(), first.union_count(second))
def test_intersection_count(self):
    """Intersection is 0 vs. an empty sketch and ~3 for identical
    three-element inputs."""
    first = HyperLogLog(4)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.intersection_count(second), 0)
    for value in (0b00011111, 0xfffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.intersection_count(second)), 3)
def test_digest(self):
    """Each digest updates the register selected by the low 4 bits with
    the rank of the leading-zero run in the remaining 28 bits."""
    sketch = HyperLogLog(4)
    # Remaining bits are 0b1: rank is the full remaining width, 32 - 4.
    sketch.digest(FakeHash(0b00011111))
    self.assertEqual(sketch.reg[0b1111], 32 - 4)
    # Remaining bits start with a 1: rank 1.
    sketch.digest(FakeHash(0xfffffff1))
    self.assertEqual(sketch.reg[1], 1)
    # Three leading hex-zero nibbles shorten the rank by 3 bits... wait,
    # computed as 32 - 4 - 3 per the 32-bit register layout.
    sketch.digest(FakeHash(0x000000f5))
    self.assertEqual(sketch.reg[5], 32 - 4 - 3)
def test_digest(self):
    """64-bit hash variant: registers record ranks over the remaining
    60 bits after the 4-bit index is stripped."""
    sketch = HyperLogLog(4)
    # Remaining 60 bits are all zero: maximum rank 64 - 4 + 1.
    sketch.digest(FakeHash(0b0001111))
    self.assertEqual(sketch.reg[0b1111], 64 - 4 + 1)
    # Remaining bits start with a 1: rank 1.
    sketch.digest(FakeHash(0xfffffffffffffff1))
    self.assertEqual(sketch.reg[1], 1)
    # 0xfffffff5 >> 4 leaves 28 significant bits of 60: rank 33.
    sketch.digest(FakeHash(0xfffffff5))
    self.assertEqual(sketch.reg[5], 33)
def test_count(self):
    """Smoke test: count() must run without raising after some digests."""
    sketch = HyperLogLog(4)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        sketch.digest(FakeHash(value))
    # We can't really verify the correctness here, just to make sure
    # no syntax error
    # See benchmarks for the accuracy of the cardinality estimation.
    sketch.count()
def test_pickle(self):
    """A pickle round trip must preserve m, p and the register array."""
    original = HyperLogLog(4)
    for value in (123, 33, 12, 0xfffffffffffffff1):
        original.digest(FakeHash(value))
    restored = pickle.loads(pickle.dumps(original))
    self.assertEqual(restored.m, original.m)
    self.assertEqual(restored.p, original.p)
    self.assertEqual(restored.reg, original.reg)
def test_pickle(self):
    """A pickle round trip must preserve m, p and the register array."""
    original = HyperLogLog(4)
    for value in (123, 33, 12, 0xffffff1):
        original.digest(FakeHash(value))
    restored = pickle.loads(pickle.dumps(original))
    self.assertEqual(restored.m, original.m)
    self.assertEqual(restored.p, original.p)
    self.assertEqual(restored.reg, original.reg)
def test_deserialize(self):
    """serialize() followed by deserialize() must reproduce the sketch."""
    original = HyperLogLog(4)
    for value in (123, 33, 12, 0xfffffffffffffff1):
        original.digest(FakeHash(value))
    buf = bytearray(original.bytesize())
    original.serialize(buf)
    restored = HyperLogLog.deserialize(buf)
    self.assertEqual(restored.p, original.p)
    self.assertEqual(restored.m, original.m)
    self.assertTrue(all(a == b for a, b in zip(original.reg, restored.reg)))
def test_deserialize(self):
    """serialize() followed by deserialize() must reproduce the sketch."""
    original = HyperLogLog(4)
    for value in (123, 33, 12, 0xfffffff1):
        original.digest(FakeHash(value))
    buf = bytearray(original.bytesize())
    original.serialize(buf)
    restored = HyperLogLog.deserialize(buf)
    self.assertEqual(restored.p, original.p)
    self.assertEqual(restored.m, original.m)
    self.assertTrue(all(a == b for a, b in zip(original.reg, restored.reg)))
def _run_hyperloglog(data, seed, p):
    """Digest every item of `data` (hashed with murmur3_32 under `seed`)
    into a HyperLogLog of precision `p` and return the estimate."""
    hasher = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p)
    for item in data:
        sketch.digest(Hash(hasher(item, seed=seed)))
    return sketch.count()
def test_inclusion(self):
    """Inclusion is 0 vs. an empty sketch and stays ~1 while the other
    sketch is a superset (64-bit hash values)."""
    first = HyperLogLog(4)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.inclusion(second), 0)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.inclusion(second)), 1)
    # Extra element in `second` keeps `first` fully included.
    second.digest(FakeHash(0xfffffff6))
    self.assertEqual(int(first.inclusion(second)), 1)
def test_union_count(self):
    """Union count equals count() for empty/identical peers and changes
    once the peer contains a new element (64-bit hash values)."""
    first = HyperLogLog(4)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.count(), first.union_count(second))
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(first.count(), first.union_count(second))
    second.digest(FakeHash(0xfffffff6))
    self.assertNotEqual(first.count(), first.union_count(second))
def test_intersection_count(self):
    """Intersection is 0 vs. an empty sketch and ~3 for identical
    three-element inputs (64-bit hash values)."""
    first = HyperLogLog(4)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.intersection_count(second), 0)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.intersection_count(second)), 3)
def test_jaccard(self):
    """Jaccard is 0 vs. an empty sketch, ~1 for identical inputs, and
    drifts away from 1 once the inputs diverge (64-bit hash values)."""
    first = HyperLogLog(4)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        first.digest(FakeHash(value))
    second = HyperLogLog(4)
    self.assertEqual(first.jaccard(second), 0)
    for value in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
        second.digest(FakeHash(value))
    self.assertEqual(int(first.jaccard(second)), 1)
    second.digest(FakeHash(0xfffffff6))
    self.assertNotEqual(first.jaccard(second), 1)