Beispiel #1
0
    def hll_iter(strms):
        """Yield one aggregated record per distinct combination of field values.

        Every line of every stream in ``strms`` is passed to ``decoder``;
        lines that fail to decode are reported on stdout and skipped, and
        records rejected by ``preprocess`` (when one is set) are skipped too.
        The remaining records are grouped by the values of all table fields
        except ``'hll'``, and ``data[hll_field]`` is added to that group's
        ``Cardunion(12)``.  Once all input is consumed, each group is yielded
        as a dict of its field values plus ``'hll'`` set to the result of
        ``dumps()`` on the group's ``Cardunion``.

        Raises ``ValueError`` if the table's field names do not include
        ``'hll'``.
        """
        buf = {}
        # Work on a copy: removing 'hll' in place would also strip it from
        # ``table._field_names`` when that attribute is the table's own list,
        # and a later call would then fail with ValueError.
        fields = list(table._field_names)
        fields.remove('hll')

        for stream in strms:
            for line in stream:
                try:
                    data = decoder(line)
                except Exception as e:
                    # Best effort: one undecodable line must not abort the run.
                    # A single pre-formatted argument prints the same text on
                    # Python 2 and is valid syntax on Python 3.
                    print("Exception decoding record (skipping): %s %s"
                          % (e, line))
                    continue
                if preprocess and not preprocess(data):
                    continue

                # The serialized list of field values is the hashable group key.
                key = ujson.dumps([data[f] for f in fields])
                hll = buf.get(key)
                if hll is None:
                    hll = buf[key] = Cardunion(12)
                hll.add(data[hll_field])

        # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
        for key, hll in buf.items():
            data = dict(zip(fields, ujson.loads(key)))
            data['hll'] = hll.dumps()
            yield data
Beispiel #2
0
    def hll_iter(strms):
        """Yield one aggregated record per distinct combination of field values.

        Every line of every stream in ``strms`` is passed to ``decoder``;
        lines that fail to decode are reported on stdout and skipped, and
        records rejected by ``preprocess`` (when one is set) are skipped too.
        The remaining records are grouped by the values of all table fields
        except ``'hll'``, and ``data[hll_field]`` is added to that group's
        ``Cardunion(12)``.  Once all input is consumed, each group is yielded
        as a dict of its field values plus ``'hll'`` set to the result of
        ``dumps()`` on the group's ``Cardunion``.

        Raises ``ValueError`` if the table's field names do not include
        ``'hll'``.
        """
        buf = {}
        # Work on a copy: removing 'hll' in place would also strip it from
        # ``table._field_names`` when that attribute is the table's own list,
        # and a later call would then fail with ValueError.
        fields = list(table._field_names)
        fields.remove('hll')

        for stream in strms:
            for line in stream:
                try:
                    data = decoder(line)
                except Exception as e:
                    # Best effort: one undecodable line must not abort the run.
                    # A single pre-formatted argument prints the same text on
                    # Python 2 and is valid syntax on Python 3.
                    print("Exception decoding record (skipping): %s %s"
                          % (e, line))
                    continue
                if preprocess and not preprocess(data):
                    continue

                # The serialized list of field values is the hashable group key.
                key = ujson.dumps([data[f] for f in fields])
                hll = buf.get(key)
                if hll is None:
                    hll = buf[key] = Cardunion(12)
                hll.add(data[hll_field])

        # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
        for key, hll in buf.items():
            data = dict(zip(fields, ujson.loads(key)))
            data['hll'] = hll.dumps()
            yield data
Beispiel #3
0
    def execute(self, set_size, m, p):
        """Assert that counting ``set_size`` distinct strings is within ``p``.

        Adds ``str(0)`` .. ``str(set_size - 1)`` to a ``Cardunion(m)`` and
        requires the relative deviation of ``count()`` from ``set_size`` to
        be strictly below ``p``.
        """
        sketch = Cardunion(m)
        for token in (str(n) for n in range(set_size)):
            sketch.add(token)

        ratio = sketch.count() / float(set_size)
        self.assertLess(abs(ratio - 1), p)
Beispiel #4
0
    def execute(self, set_size, m, p):
        """Assert that counting ``set_size`` distinct strings is within ``p``.

        Adds ``str(0)`` .. ``str(set_size - 1)`` to a ``Cardunion(m)`` and
        requires the relative deviation of ``count()`` from ``set_size`` to
        be strictly below ``p``.
        """
        sketch = Cardunion(m)
        for token in (str(n) for n in range(set_size)):
            sketch.add(token)

        ratio = sketch.count() / float(set_size)
        self.assertLess(abs(ratio - 1), p)
Beispiel #5
0
    def test_sparse_dumps(self):
        """``loads(dumps())`` must reproduce the count of a 500-item set."""
        source = Cardunion(self.log2m)
        restored = Cardunion(self.log2m)
        for token in (str(n) for n in range(500)):
            source.add(token)

        serialized = source.dumps()
        restored.loads(serialized)
        self.assertEqual(source.count(), restored.count())
Beispiel #6
0
    def test_intersect_a_lot(self):
        """Intersect ten shifted 100000-item sets and check the estimate.

        Set ``i`` holds ``str(v)`` for ``v`` in ``[5000 * i, 5000 * i + actual)``,
        so the values common to all sets number ``actual - (nset - 1) * 5000``.
        The estimate must lie within three times the reported error of that.
        """
        hlls = []
        actual = 100000
        nset = 10
        for i in range(nset):
            hll = Cardunion()
            for j in range(actual):
                hll.add(str(i * 5000 + j))
            hlls.append(hll)

        estimate, error, _ = Cardunion.intersect(hlls)
        # A single pre-formatted argument prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("%s %s" % (estimate, error))
        expected = actual - (nset - 1) * 5000
        # Two separate bounds report the offending values on failure.
        self.assertLessEqual(expected - 3 * error, estimate)
        self.assertLessEqual(estimate, expected + 3 * error)
Beispiel #7
0
    def test_intersect_a_few(self):
        """Intersect three nested sets and check the estimate.

        The sets hold ``str(i)`` for ``i`` in ``[0, 5000)``, ``[1, 100000)``
        and ``[25, 1000)``, so exactly 975 values are common to all three.
        The estimate must lie within three times the reported error of that.
        """
        hll = Cardunion()
        hll_1 = Cardunion()
        hll_2 = Cardunion()
        for i in range(5000):
            hll.add(str(i))
        for i in range(1, 100000):
            hll_1.add(str(i))
        for i in range(25, 1000):
            hll_2.add(str(i))

        estimate, error, _ = Cardunion.intersect([hll_2, hll_1, hll])
        # A single pre-formatted argument prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("%s %s" % (estimate, error))
        expected = 975
        # Two separate bounds report the offending values on failure.
        self.assertLessEqual(expected - 3 * error, estimate)
        self.assertLessEqual(estimate, expected + 3 * error)
Beispiel #8
0
    def test_intersect_big_small(self):
        """Intersect a 50-item set with a much larger one and check the estimate.

        The sets hold ``str(i)`` for ``i`` in ``[0, 50)`` and ``[1, 100000)``.
        ``"0"`` is only in the small set, so 49 values are shared.  The
        estimate must lie within three times the reported error of that.
        """
        hll = Cardunion()
        hll_1 = Cardunion()
        for i in range(50):
            hll.add(str(i))
        for i in range(1, 100000):
            hll_1.add(str(i))

        estimate, error, _ = Cardunion.intersect([hll_1, hll])
        # A single pre-formatted argument prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("%s %s" % (estimate, error))
        # True intersection size is 49 (values 1..49), not 50.
        expected = 49
        # Two separate bounds report the offending values on failure.
        self.assertLessEqual(expected - 3 * error, estimate)
        self.assertLessEqual(estimate, expected + 3 * error)
Beispiel #9
0
    def test_intersect_a_lot(self):
        """Intersect ten shifted 100000-item sets and check the estimate.

        Set ``i`` holds ``str(v)`` for ``v`` in ``[5000 * i, 5000 * i + actual)``,
        so the values common to all sets number ``actual - (nset - 1) * 5000``.
        The estimate must lie within three times the reported error of that.
        """
        hlls = []
        actual = 100000
        nset = 10
        for i in range(nset):
            hll = Cardunion()
            for j in range(actual):
                hll.add(str(i * 5000 + j))
            hlls.append(hll)

        estimate, error, _ = Cardunion.intersect(hlls)
        # A single pre-formatted argument prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("%s %s" % (estimate, error))
        expected = actual - (nset - 1) * 5000
        # Two separate bounds report the offending values on failure.
        self.assertLessEqual(expected - 3 * error, estimate)
        self.assertLessEqual(estimate, expected + 3 * error)
Beispiel #10
0
    def test_union(self):
        """Union of two disjoint 10000-item sets must count close to 20000.

        The relative deviation from 20000 has to be below ``self.error``.
        """
        left = Cardunion(self.log2m)
        right = Cardunion(self.log2m)
        for token in (str(n) for n in range(10000)):
            left.add(token)
        for token in (str(n) for n in range(10000, 20000)):
            right.add(token)

        left.union([right])
        ratio = left.count() / float(20000)
        self.assertLess(abs(ratio - 1), self.error)
Beispiel #11
0
    def test_intersect(self):
        """Since there is no theoretical error bound for intersection,
        we'd use 3-sigma rule instead.

        The sets hold ``str(i)`` for ``i`` in ``[0, 10000)`` and
        ``[5000, 15000)``, so 5000 values are shared.
        """
        hll = Cardunion()
        hll_1 = Cardunion()
        for i in range(10000):
            hll.add(str(i))
        for i in range(5000, 15000):
            hll_1.add(str(i))

        estimate, error, _ = Cardunion.intersect([hll_1, hll])
        # A single pre-formatted argument prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("%s %s" % (estimate, error))
        expected = 5000
        # Two separate bounds report the offending values on failure.
        self.assertLessEqual(expected - 3 * error, estimate)
        self.assertLessEqual(estimate, expected + 3 * error)
Beispiel #12
0
    def test_intersect_big_small(self):
        """Intersect a 50-item set with a much larger one and check the estimate.

        The sets hold ``str(i)`` for ``i`` in ``[0, 50)`` and ``[1, 100000)``.
        ``"0"`` is only in the small set, so 49 values are shared.  The
        estimate must lie within three times the reported error of that.
        """
        hll = Cardunion()
        hll_1 = Cardunion()
        for i in range(50):
            hll.add(str(i))
        for i in range(1, 100000):
            hll_1.add(str(i))

        estimate, error, _ = Cardunion.intersect([hll_1, hll])
        # A single pre-formatted argument prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("%s %s" % (estimate, error))
        # True intersection size is 49 (values 1..49), not 50.
        expected = 49
        # Two separate bounds report the offending values on failure.
        self.assertLessEqual(expected - 3 * error, estimate)
        self.assertLessEqual(estimate, expected + 3 * error)
Beispiel #13
0
    def test_with_heavy_duplicates(self):
        """Count a stream in which most additions repeat the value ``"1"``.

        Only even ``i`` in the upper half of ``range(set_size)`` are added
        as themselves; every other iteration adds ``str(1)``.  The count is
        compared against ``set_size / 4`` with tolerance ``self.error``.
        """
        sketch = Cardunion(self.log2m)
        set_size = 100000
        for i in range(set_size):
            # Negation of the "duplicate" condition: even and in the upper half.
            is_unique = not i % 2 and not i < set_size / 2
            sketch.add(str(i) if is_unique else str(1))

        expected = set_size * 1.0 / 4.0
        ratio = sketch.count() / float(expected)
        self.assertLess(abs(ratio - 1), self.error)
Beispiel #14
0
    def test_intersect(self):
        """Since there is no theoretical error bound for intersection,
        we'd use 3-sigma rule instead.

        The sets hold ``str(i)`` for ``i`` in ``[0, 10000)`` and
        ``[5000, 15000)``, so 5000 values are shared.
        """
        hll = Cardunion()
        hll_1 = Cardunion()
        for i in range(10000):
            hll.add(str(i))
        for i in range(5000, 15000):
            hll_1.add(str(i))

        estimate, error, _ = Cardunion.intersect([hll_1, hll])
        # A single pre-formatted argument prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("%s %s" % (estimate, error))
        expected = 5000
        # Two separate bounds report the offending values on failure.
        self.assertLessEqual(expected - 3 * error, estimate)
        self.assertLessEqual(estimate, expected + 3 * error)
Beispiel #15
0
    def test_with_heavy_duplicates(self):
        """Count a stream in which most additions repeat the value ``"1"``.

        Only even ``i`` in the upper half of ``range(set_size)`` are added
        as themselves; every other iteration adds ``str(1)``.  The count is
        compared against ``set_size / 4`` with tolerance ``self.error``.
        """
        sketch = Cardunion(self.log2m)
        set_size = 100000
        for i in range(set_size):
            # Negation of the "duplicate" condition: even and in the upper half.
            is_unique = not i % 2 and not i < set_size / 2
            sketch.add(str(i) if is_unique else str(1))

        expected = set_size * 1.0 / 4.0
        ratio = sketch.count() / float(expected)
        self.assertLess(abs(ratio - 1), self.error)
Beispiel #16
0
 def _inner_deault():
     """Return a new ``Cardunion`` constructed with the argument 12."""
     # Imported at call time, as in the original, not at module import time.
     import cardunion
     return cardunion.Cardunion(12)
Beispiel #17
0
    def test_bunion(self):
        """``bunion`` of two dumped disjoint sets must count close to 30000.

        Three sets of 10000 distinct values each are built; the second and
        third are passed to ``bunion`` in their ``dumps()`` form.  The
        relative deviation from 30000 has to be below ``self.error``.
        """
        base = Cardunion(self.log2m)
        second = Cardunion(self.log2m)
        third = Cardunion(self.log2m)
        for token in (str(n) for n in range(10000)):
            base.add(token)
        for token in (str(n) for n in range(10000, 20000)):
            second.add(token)
        for token in (str(n) for n in range(20000, 30000)):
            third.add(token)

        dumped = [second.dumps(), third.dumps()]
        base.bunion(dumped)
        ratio = base.count() / float(30000)
        self.assertLess(abs(ratio - 1), self.error)
Beispiel #18
0
    def test_union(self):
        """Union of two disjoint 10000-item sets must count close to 20000.

        The relative deviation from 20000 has to be below ``self.error``.
        """
        left = Cardunion(self.log2m)
        right = Cardunion(self.log2m)
        for token in (str(n) for n in range(10000)):
            left.add(token)
        for token in (str(n) for n in range(10000, 20000)):
            right.add(token)

        left.union([right])
        ratio = left.count() / float(20000)
        self.assertLess(abs(ratio - 1), self.error)
Beispiel #19
0
    def test_sparse_dumps(self):
        """``loads(dumps())`` must reproduce the count of a 500-item set."""
        source = Cardunion(self.log2m)
        restored = Cardunion(self.log2m)
        for token in (str(n) for n in range(500)):
            source.add(token)

        serialized = source.dumps()
        restored.loads(serialized)
        self.assertEqual(source.count(), restored.count())
Beispiel #20
0
 def test_nonzero_counters(self):
     """``nonzero_counters`` must yield the pairs given to ``update_counter``.

     Three ``update_counter(a, b)`` calls are made on a fresh ``Cardunion``;
     iterating ``nonzero_counters`` must then produce the same ``(a, b)``
     pairs in the same order.
     """
     h = Cardunion()
     updates = [(1, 2), (3, 4), (5, 8)]
     for first, second in updates:
         h.update_counter(first, second)
     # ``assertEquals`` is a deprecated alias removed in Python 3.12.
     self.assertEqual(list(h.nonzero_counters), updates)
Beispiel #21
0
 def test_nonzero_counters(self):
     """``nonzero_counters`` must yield the pairs given to ``update_counter``.

     Three ``update_counter(a, b)`` calls are made on a fresh ``Cardunion``;
     iterating ``nonzero_counters`` must then produce the same ``(a, b)``
     pairs in the same order.
     """
     h = Cardunion()
     updates = [(1, 2), (3, 4), (5, 8)]
     for first, second in updates:
         h.update_counter(first, second)
     # ``assertEquals`` is a deprecated alias removed in Python 3.12.
     self.assertEqual(list(h.nonzero_counters), updates)
Beispiel #22
0
    def test_intersect_a_few(self):
        """Intersect three nested sets and check the estimate.

        The sets hold ``str(i)`` for ``i`` in ``[0, 5000)``, ``[1, 100000)``
        and ``[25, 1000)``, so exactly 975 values are common to all three.
        The estimate must lie within three times the reported error of that.
        """
        hll = Cardunion()
        hll_1 = Cardunion()
        hll_2 = Cardunion()
        for i in range(5000):
            hll.add(str(i))
        for i in range(1, 100000):
            hll_1.add(str(i))
        for i in range(25, 1000):
            hll_2.add(str(i))

        estimate, error, _ = Cardunion.intersect([hll_2, hll_1, hll])
        # A single pre-formatted argument prints the same text on Python 2
        # and is valid syntax on Python 3.
        print("%s %s" % (estimate, error))
        expected = 975
        # Two separate bounds report the offending values on failure.
        self.assertLessEqual(expected - 3 * error, estimate)
        self.assertLessEqual(estimate, expected + 3 * error)
Beispiel #23
0
    def test_bunion(self):
        """``bunion`` of two dumped disjoint sets must count close to 30000.

        Three sets of 10000 distinct values each are built; the second and
        third are passed to ``bunion`` in their ``dumps()`` form.  The
        relative deviation from 30000 has to be below ``self.error``.
        """
        base = Cardunion(self.log2m)
        second = Cardunion(self.log2m)
        third = Cardunion(self.log2m)
        for token in (str(n) for n in range(10000)):
            base.add(token)
        for token in (str(n) for n in range(10000, 20000)):
            second.add(token)
        for token in (str(n) for n in range(20000, 30000)):
            third.add(token)

        dumped = [second.dumps(), third.dumps()]
        base.bunion(dumped)
        ratio = base.count() / float(30000)
        self.assertLess(abs(ratio - 1), self.error)