def eg1():
    """Print the estimated and the exact cardinality of ``data1``.

    Each item of ``data1`` is UTF-8 encoded, hashed with ``sha1`` and
    digested by a default-constructed HyperLogLog. The exact figure is the
    size of ``set(data1)``.
    """
    sketch = HyperLogLog()
    for item in data1:
        encoded = item.encode('utf8')
        sketch.digest(sha1(encoded))
    print("Estimated cardinality is", sketch.count())

    distinct = set(data1)
    print("Actual cardinality is", len(distinct))
Beispiel #2
0
 def test_merge(self):
     """After merge(), the target holds the registers set by both sketches."""
     target = HyperLogLog(4)
     donor = HyperLogLog(4)
     target.digest(FakeHash(0b00011111))
     donor.digest(FakeHash(0xfffffff1))
     # Fold the donor's registers into the target before inspecting them.
     target.merge(donor)
     # (register index, expected register value) pairs, checked in order.
     for index, expected in ((0b1111, 32 - 4), (1, 1)):
         self.assertEqual(target.reg[index], expected)
Beispiel #3
0
 def test_merge(self):
     """After merge(), the target holds the registers set by both sketches."""
     target = HyperLogLog(4)
     donor = HyperLogLog(4)
     target.digest(FakeHash(0b0001111))
     donor.digest(FakeHash(0xfffffffffffffff1))
     # Fold the donor's registers into the target before inspecting them.
     target.merge(donor)
     # (register index, expected register value) pairs, checked in order.
     for index, expected in ((0b1111, 64 - 4 + 1), (1, 1)):
         self.assertEqual(target.reg[index], expected)
Beispiel #4
0
def eg1():
    """Compare a HyperLogLog estimate of ``data1`` with its exact size.

    Items are UTF-8 encoded and hashed with ``sha1`` before being digested;
    both the estimate and the exact distinct count are printed.
    """
    hll = HyperLogLog()
    for text in data1:
        hll.digest(sha1(text.encode('utf8')))
    estimate = hll.count()
    print("Estimated cardinality is", estimate)

    exact = len(set(data1))
    print("Actual cardinality is", exact)
def run_perf(card, p):
    """Time how long a HyperLogLog takes to digest ``card`` hashes.

    Args:
        card: Number of values to digest. The integers ``0 .. card - 1`` are
            converted with ``int_bytes`` and hashed with ``sha1``.
        p: Precision forwarded to ``HyperLogLog(p=p)``.

    Returns:
        Elapsed time of the digest loop in seconds, as a float.
    """
    h = HyperLogLog(p=p)
    logging.info("HyperLogLog using p = %d ", p)
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring short durations.
    start = time.perf_counter()
    for i in range(card):
        h.digest(sha1(int_bytes(i)))
    duration = time.perf_counter() - start
    logging.info("Digested %d hashes in %.4f sec", card, duration)
    return duration
def run_perf(card, p):
    """Measure the time needed to digest ``card`` hashes into a HyperLogLog.

    Args:
        card: Number of values to digest. Each integer in ``range(card)`` is
            passed through ``int_bytes`` and ``sha1`` before being digested.
        p: Precision forwarded to ``HyperLogLog(p=p)``.

    Returns:
        Duration of the digest loop in seconds, as a float.
    """
    h = HyperLogLog(p=p)
    logging.info("HyperLogLog using p = %d ", p)
    # time.clock() no longer exists on Python 3.8+, so use perf_counter(),
    # the documented replacement for timing short intervals.
    timer = time.perf_counter
    start = timer()
    for i in range(card):
        h.digest(sha1(int_bytes(i)))
    duration = timer() - start
    logging.info("Digested %d hashes in %.4f sec", card, duration)
    return duration
Beispiel #7
0
 def test_count(self):
     """Smoke test: count() runs after three fake hashes were digested."""
     sketch = HyperLogLog(4)
     for raw in (0b00011111, 0xfffffff1, 0xfffffff5):
         sketch.digest(FakeHash(raw))
     # The estimate itself is not checked here; this only makes sure that
     # count() executes without error. See the benchmarks for the accuracy
     # of the cardinality estimation.
     sketch.count()
def _run_hyperloglog(A, B, data, seed, p):
    """Build one HyperLogLog per index range and compare them for inclusion.

    Args:
        A: ``(start, end)`` pair; ``data[start] .. data[end - 1]`` are
            digested into the first sketch.
        B: ``(start, end)`` pair digested into the second sketch in the
            same way.
        data: Indexable collection of values accepted by the murmur3 hasher.
        seed: Seed forwarded to every hasher call.
        p: Precision forwarded to both ``HyperLogLog(p=p)`` constructors.

    Returns:
        The result of ``_hyperloglog_inclusion(h1, h2)``.
    """
    (a_start, a_end), (b_start, b_end) = A, B
    hasher = pyhash.murmur3_32()
    h1 = HyperLogLog(p=p)
    h2 = HyperLogLog(p=p)
    # range() rather than the Python-2-only xrange(), so the function also
    # runs on Python 3 (where xrange raises NameError).
    for i in range(a_start, a_end):
        h1.digest(Hash(hasher(data[i], seed=seed)))
    for i in range(b_start, b_end):
        h2.digest(Hash(hasher(data[i], seed=seed)))
    return _hyperloglog_inclusion(h1, h2)
def run_acc(size, seed, p):
    """Return the relative error of a HyperLogLog estimate on random data.

    Args:
        size: Number of random draws; each draw is an integer in
            ``[1, size]`` converted with ``int_bytes``.
        seed: Seed applied to the module-level ``random`` generator.
        p: Precision forwarded to ``HyperLogLog(p=p)``.

    Returns:
        ``|exact - estimate| / exact``, where ``exact`` is the number of
        distinct values drawn.
    """
    logging.info("HyperLogLog using p = %d " % p)
    sketch = HyperLogLog(p=p)
    seen = set()
    random.seed(seed)
    for _ in range(size):
        value = int_bytes(random.randint(1, size))
        sketch.digest(sha1(value))
        seen.add(value)
    exact = float(len(seen))
    return abs(exact - sketch.count()) / exact
def run_acc(size, seed, p):
    """Measure how far a HyperLogLog estimate is from the exact count.

    Args:
        size: Number of random integers to draw from ``[1, size]``; each is
            converted with ``int_bytes`` and hashed with ``sha1``.
        seed: Seed applied to the module-level ``random`` generator.
        p: Precision forwarded to ``HyperLogLog(p=p)``.

    Returns:
        The absolute estimation error divided by the exact distinct count.
    """
    logging.info("HyperLogLog using p = %d " % p)
    hll = HyperLogLog(p=p)
    distinct = set()
    random.seed(seed)
    for _ in range(size):
        drawn = random.randint(1, size)
        encoded = int_bytes(drawn)
        hll.digest(sha1(encoded))
        distinct.add(encoded)
    truth = float(len(distinct))
    error = abs(truth - hll.count())
    return error / truth
def eg2():
    """Print the estimated and the exact union cardinality of two datasets.

    ``data1`` and ``data2`` are each digested into their own HyperLogLog
    (items UTF-8 encoded and hashed with ``sha1``); the sketches are then
    combined with ``HyperLogLog.union``.
    """
    first = HyperLogLog()
    second = HyperLogLog()
    for item in data1:
        encoded = item.encode('utf8')
        first.digest(sha1(encoded))
    for item in data2:
        encoded = item.encode('utf8')
        second.digest(sha1(encoded))
    merged = HyperLogLog.union(first, second)
    print("Estimated union cardinality is", merged.count())

    exact_union = set(data1) | set(data2)
    print("Actual union cardinality is", len(exact_union))
Beispiel #12
0
    def test_jaccard(self):
        """Check jaccard() against an empty, an equal and a larger sketch."""
        shared = (0b00011111, 0xfffffff1, 0xfffffff5)
        filled = HyperLogLog(4)
        for raw in shared:
            filled.digest(FakeHash(raw))
        other = HyperLogLog(4)
        # Against a sketch that has digested nothing the result must be 0.
        self.assertEqual(filled.jaccard(other), 0)

        # Same hashes on both sides: the value truncates to 1.
        for raw in shared:
            other.digest(FakeHash(raw))
        self.assertEqual(int(filled.jaccard(other)), 1)

        # One extra hash on the other side: no longer exactly 1.
        other.digest(FakeHash(0xfffffff6))
        self.assertNotEqual(filled.jaccard(other), 1)
Beispiel #13
0
    def test_inclusion(self):
        """Check inclusion() against an empty, an equal and a larger sketch."""
        shared = (0b00011111, 0xfffffff1, 0xfffffff5)
        filled = HyperLogLog(4)
        for raw in shared:
            filled.digest(FakeHash(raw))
        other = HyperLogLog(4)
        # Against a sketch that has digested nothing the result must be 0.
        self.assertEqual(filled.inclusion(other), 0)

        # Same hashes on both sides: the value truncates to 1.
        for raw in shared:
            other.digest(FakeHash(raw))
        self.assertEqual(int(filled.inclusion(other)), 1)

        # An extra hash on the other side must leave the truncated value at 1.
        other.digest(FakeHash(0xfffffff6))
        self.assertEqual(int(filled.inclusion(other)), 1)
Beispiel #14
0
    def test_union_count(self):
        """Check union_count() against an empty, an equal and a larger sketch."""
        shared = (0b00011111, 0xfffffff1, 0xfffffff5)
        filled = HyperLogLog(4)
        for raw in shared:
            filled.digest(FakeHash(raw))
        other = HyperLogLog(4)
        # Union with a sketch that has digested nothing equals the own count.
        self.assertEqual(filled.count(), filled.union_count(other))

        # Union with a sketch holding the same hashes also equals the count.
        for raw in shared:
            other.digest(FakeHash(raw))
        self.assertEqual(filled.count(), filled.union_count(other))

        # One extra hash on the other side must change the union count.
        other.digest(FakeHash(0xfffffff6))
        self.assertNotEqual(filled.count(), filled.union_count(other))
Beispiel #15
0
    def test_intersection_count(self):
        """Check intersection_count() against an empty and an equal sketch."""
        shared = (0b00011111, 0xfffffff1, 0xfffffff5)
        filled = HyperLogLog(4)
        for raw in shared:
            filled.digest(FakeHash(raw))
        other = HyperLogLog(4)
        # Nothing digested on the other side: the intersection must be 0.
        self.assertEqual(filled.intersection_count(other), 0)

        # Same three hashes on both sides: the value truncates to 3.
        for raw in shared:
            other.digest(FakeHash(raw))
        self.assertEqual(int(filled.intersection_count(other)), 3)
Beispiel #16
0
 def test_digest(self):
     """Each digested hash must set the expected register value."""
     sketch = HyperLogLog(4)
     # (hash value, register index, expected register value), in order.
     steps = (
         (0b00011111, 0b1111, 32 - 4),
         (0xfffffff1, 1, 1),
         (0x000000f5, 5, 32 - 4 - 3),
     )
     for raw, index, expected in steps:
         sketch.digest(FakeHash(raw))
         self.assertEqual(sketch.reg[index], expected)
Beispiel #17
0
 def test_digest(self):
     """Each digested hash must set the expected register value."""
     sketch = HyperLogLog(4)
     # (hash value, register index, expected register value), in order.
     steps = (
         (0b0001111, 0b1111, 64 - 4 + 1),
         (0xfffffffffffffff1, 1, 1),
         (0xfffffff5, 5, 33),
     )
     for raw, index, expected in steps:
         sketch.digest(FakeHash(raw))
         self.assertEqual(sketch.reg[index], expected)
Beispiel #18
0
 def test_count(self):
     """Smoke test: count() runs after three fake hashes were digested."""
     sketch = HyperLogLog(4)
     for raw in (0b0001111, 0xfffffffffffffff1, 0xfffffff5):
         sketch.digest(FakeHash(raw))
     # The estimate itself is not checked here; this only makes sure that
     # count() executes without error. See the benchmarks for the accuracy
     # of the cardinality estimation.
     sketch.count()
Beispiel #19
0
 def test_pickle(self):
     """A pickle round trip must preserve m, p and reg."""
     original = HyperLogLog(4)
     for raw in (123, 33, 12, 0xfffffffffffffff1):
         original.digest(FakeHash(raw))
     payload = pickle.dumps(original)
     restored = pickle.loads(payload)
     self.assertEqual(restored.m, original.m)
     self.assertEqual(restored.p, original.p)
     self.assertEqual(restored.reg, original.reg)
Beispiel #20
0
 def test_pickle(self):
     """A pickle round trip must preserve m, p and reg."""
     original = HyperLogLog(4)
     for raw in (123, 33, 12, 0xffffff1):
         original.digest(FakeHash(raw))
     payload = pickle.dumps(original)
     restored = pickle.loads(payload)
     self.assertEqual(restored.m, original.m)
     self.assertEqual(restored.p, original.p)
     self.assertEqual(restored.reg, original.reg)
Beispiel #21
0
 def test_deserialize(self):
     """serialize() followed by deserialize() must reproduce p, m and reg."""
     source = HyperLogLog(4)
     for raw in (123, 33, 12, 0xfffffffffffffff1):
         source.digest(FakeHash(raw))
     # The buffer is sized by the sketch itself and filled in place.
     buf = bytearray(source.bytesize())
     source.serialize(buf)
     decoded = HyperLogLog.deserialize(buf)
     self.assertEqual(decoded.p, source.p)
     self.assertEqual(decoded.m, source.m)
     register_pairs = zip(source.reg, decoded.reg)
     self.assertTrue(all(before == after for before, after in register_pairs))
Beispiel #22
0
 def test_deserialize(self):
     """serialize() followed by deserialize() must reproduce p, m and reg."""
     source = HyperLogLog(4)
     for raw in (123, 33, 12, 0xfffffff1):
         source.digest(FakeHash(raw))
     # The buffer is sized by the sketch itself and filled in place.
     buf = bytearray(source.bytesize())
     source.serialize(buf)
     decoded = HyperLogLog.deserialize(buf)
     self.assertEqual(decoded.p, source.p)
     self.assertEqual(decoded.m, source.m)
     register_pairs = zip(source.reg, decoded.reg)
     self.assertTrue(all(before == after for before, after in register_pairs))
def _run_hyperloglog(data, seed, p):
    """Return the HyperLogLog cardinality estimate for ``data``.

    Args:
        data: Iterable of values accepted by the murmur3 hasher.
        seed: Seed forwarded to every hasher call.
        p: Precision forwarded to ``HyperLogLog(p=p)``.

    Returns:
        The result of ``count()`` after every item has been digested.
    """
    murmur = pyhash.murmur3_32()
    sketch = HyperLogLog(p=p)
    for item in data:
        hashed = murmur(item, seed=seed)
        sketch.digest(Hash(hashed))
    return sketch.count()
Beispiel #24
0
    def test_inclusion(self):
        """Check inclusion() against an empty, an equal and a larger sketch."""
        shared = (0b0001111, 0xfffffffffffffff1, 0xfffffff5)
        filled = HyperLogLog(4)
        for raw in shared:
            filled.digest(FakeHash(raw))
        other = HyperLogLog(4)
        # Against a sketch that has digested nothing the result must be 0.
        self.assertEqual(filled.inclusion(other), 0)

        # Same hashes on both sides: the value truncates to 1.
        for raw in shared:
            other.digest(FakeHash(raw))
        self.assertEqual(int(filled.inclusion(other)), 1)

        # An extra hash on the other side must leave the truncated value at 1.
        other.digest(FakeHash(0xfffffff6))
        self.assertEqual(int(filled.inclusion(other)), 1)
Beispiel #25
0
    def test_union_count(self):
        """Check union_count() against an empty, an equal and a larger sketch."""
        shared = (0b0001111, 0xfffffffffffffff1, 0xfffffff5)
        filled = HyperLogLog(4)
        for raw in shared:
            filled.digest(FakeHash(raw))
        other = HyperLogLog(4)
        # Union with a sketch that has digested nothing equals the own count.
        self.assertEqual(filled.count(), filled.union_count(other))

        # Union with a sketch holding the same hashes also equals the count.
        for raw in shared:
            other.digest(FakeHash(raw))
        self.assertEqual(filled.count(), filled.union_count(other))

        # One extra hash on the other side must change the union count.
        other.digest(FakeHash(0xfffffff6))
        self.assertNotEqual(filled.count(), filled.union_count(other))
Beispiel #26
0
    def test_intersection_count(self):
        """Check intersection_count() against an empty and an equal sketch."""
        shared = (0b0001111, 0xfffffffffffffff1, 0xfffffff5)
        filled = HyperLogLog(4)
        for raw in shared:
            filled.digest(FakeHash(raw))
        other = HyperLogLog(4)
        # Nothing digested on the other side: the intersection must be 0.
        self.assertEqual(filled.intersection_count(other), 0)

        # Same three hashes on both sides: the value truncates to 3.
        for raw in shared:
            other.digest(FakeHash(raw))
        self.assertEqual(int(filled.intersection_count(other)), 3)
Beispiel #27
0
    def test_jaccard(self):
        """Check jaccard() against an empty, an equal and a larger sketch."""
        shared = (0b0001111, 0xfffffffffffffff1, 0xfffffff5)
        filled = HyperLogLog(4)
        for raw in shared:
            filled.digest(FakeHash(raw))
        other = HyperLogLog(4)
        # Against a sketch that has digested nothing the result must be 0.
        self.assertEqual(filled.jaccard(other), 0)

        # Same hashes on both sides: the value truncates to 1.
        for raw in shared:
            other.digest(FakeHash(raw))
        self.assertEqual(int(filled.jaccard(other)), 1)

        # One extra hash on the other side: no longer exactly 1.
        other.digest(FakeHash(0xfffffff6))
        self.assertNotEqual(filled.jaccard(other), 1)