Esempio n. 1
0
 def test(cls, count=100000, registers=512):
   """Print the cardinality estimate for a stream of random strings.

   Builds ``HyperLogLog(registers)``, adds ``count`` random strings made of
   30 ASCII letters each, and prints ``getEstimatedCardinality()``.
   ``cls`` is accepted but not used by this body.
   """
   hll = HyperLogLog(registers)
   for _ in range(count):
     # random.choice picks a letter directly instead of indexing by randint.
     token = "".join(random.choice(string.ascii_letters) for _pos in range(30))
     hll.add(token)
   # The parenthesised form prints the same value on Python 2 and Python 3.
   print(hll.getEstimatedCardinality())
Esempio n. 2
0
 def test_init(self):
     # A HyperLogLog(0.05), after upgrade(), must report p == 9, the matching
     # alpha constant, m == 512 and a register array M of length 512.
     sketch = HyperLogLog(0.05)
     sketch.upgrade()
     register_count = 512
     self.assertEqual(sketch.p, 9)
     self.assertEqual(sketch.alpha, 0.7197831133217303)
     self.assertEqual(sketch.m, register_count)
     self.assertEqual(len(sketch.M), register_count)
Esempio n. 3
0
 def test_pickle(self):
     # A sketch sent through pickle.dumps / pickle.loads must expose the same
     # M, alpha, p and m as the sketch it was made from.
     original = HyperLogLog(0.05)
     for text in map(str, range(100)):
         original.add(text)
     payload = pickle.dumps(original)
     restored = pickle.loads(payload)
     for attr in ("M", "alpha", "p", "m"):
         self.assertEqual(getattr(original, attr), getattr(restored, attr))
Esempio n. 4
0
class TestHyperLogLog(unittest.TestCase, BasicHLLTests):
    # Test case built on unittest.TestCase and BasicHLLTests; every test gets
    # two HyperLogLog(16, 16) instances from setUp.

    def setUp(self):
        # Build a fresh pair of identically configured sketches.
        for attribute in ("hll", "hll2"):
            setattr(self, attribute, HyperLogLog(16, 16))

    def test_a_repeated_element_is_ignored(self):
        # Adding 37 once and adding it 1000 times must leave equal `logs`.
        repeated = [37] * 1000
        self.hll.add_object(37)
        self.hll2.add_objects(repeated)
        self.assertEqual(self.hll.logs, self.hll2.logs)
Esempio n. 5
0
    def test_add(self):
        # After adding the strings "0".."9", the registers holding a value
        # above zero must be exactly these (index, value) pairs.
        sketch = HyperLogLog(0.05)
        for digit in range(10):
            sketch.add(str(digit))

        populated = [
            (index, value)
            for index, value in enumerate(sketch.M)
            if value > 0
        ]
        expected = [
            (31, 1), (120, 1), (122, 4), (151, 5), (171, 3),
            (176, 1), (196, 1), (268, 1), (443, 2), (474, 1),
        ]
        self.assertEqual(populated, expected)
Esempio n. 6
0
 def test_pickle(self):
     # Pickle round trip: the restored sketch must match the source sketch
     # in its registers (M) and in alpha, p and m.
     source = HyperLogLog(0.05)
     for number in range(100):
         key = str(number)
         source.add(key)

     clone = pickle.loads(pickle.dumps(source))

     self.assertEqual(source.M, clone.M)
     self.assertEqual(source.alpha, clone.alpha)
     self.assertEqual(source.p, clone.p)
     self.assertEqual(source.m, clone.m)
Esempio n. 7
0
    def test_add(self):
        # Registers above zero after adding "0".."9" must match the
        # reference (index, value) pairs below.
        expected = [
            (1, 1), (41, 1), (44, 1), (76, 3), (103, 4),
            (182, 1), (442, 2), (464, 5), (497, 1), (506, 1),
        ]

        sketch = HyperLogLog(0.05)
        for text in map(str, range(10)):
            sketch.add(text)

        populated = [pair for pair in enumerate(sketch.M) if pair[1] > 0]

        self.assertEqual(populated, expected)
Esempio n. 8
0
    def test_add(self):
        # Collect every register whose value is above zero after adding
        # "0".."9" and compare against the reference pairs.
        sketch = HyperLogLog(0.05)
        for number in range(10):
            sketch.add(str(number))

        populated = []
        for slot, rank in enumerate(sketch.M):
            if rank > 0:
                populated.append((slot, rank))

        expected = [
            (1, 1), (41, 1), (44, 1), (76, 3), (103, 4),
            (182, 1), (442, 2), (464, 5), (497, 1), (506, 1),
        ]
        self.assertEqual(populated, expected)
Esempio n. 9
0
 def test_pickle(self):
     # After upgrade(), a pickle round trip must keep the register array,
     # alpha, p, m and the reported length unchanged.
     original = HyperLogLog(0.05)
     for number in range(100):
         original.add(str(number))
     original.upgrade()

     restored = pickle.loads(pickle.dumps(original))

     numpy.testing.assert_array_equal(original.M, restored.M)
     for attr in ("alpha", "p", "m"):
         self.assertEqual(getattr(original, attr), getattr(restored, attr))
     self.assertEqual(len(original), len(restored))
Esempio n. 10
0
    def test_calc_cardinality(self):
        # For each true cardinality, len(sketch) must lie strictly between
        # cardinality * (1 - 1.04 / sqrt(m)) and cardinality * (1 + 1.04 / sqrt(m)).
        for cardinality in (1, 2, 3, 5, 10, 1500, 100000, 1000000):
            sketch = HyperLogLog(0.05)
            for value in xrange(cardinality):
                sketch.add(str(value))

            spread = 1.04 / math.sqrt(sketch.m)
            lower_bound = cardinality * (1.0 - spread)
            upper_bound = cardinality * (1.0 + spread)
            self.assertGreater(len(sketch), lower_bound)
            self.assertLess(len(sketch), upper_bound)
Esempio n. 11
0
    def test_hll(self):
        # A HyperLogLog(0.01) backed by a zero-filled MmapSlice must report
        # length 0, and length 1 after one value has been added.
        m = 16384
        # The with-block closes the temporary file even when an assertion
        # fails; before, the handle was only released by garbage collection.
        with tempfile.NamedTemporaryFile('r+b') as f1:
            # The file is opened in binary mode, so write bytes. Repetition
            # replaces joining m one-character strings.
            f1.write(b'\x00' * m)
            # Flush so the mapping below sees all m bytes.
            f1.flush()
            mfile1 = mmap.mmap(f1.fileno(), 0)
            mslice1 = MmapSlice(mfile1, m, 0)
            test = HyperLogLog(0.01, mslice1)

            self.assertEqual(len(test), 0)
            test.add('test_val')
            self.assertEqual(len(test), 1)
Esempio n. 12
0
    def test_hll(self):
        # An empty mmap-backed HyperLogLog(0.01) has length 0; adding one
        # value raises the length to 1.
        m = 16384
        # Context manager: the temporary file is closed on every exit path
        # instead of being left open until garbage collection.
        with tempfile.NamedTemporaryFile('r+b') as f1:
            # Binary-mode file, so the zero fill is a bytes object.
            f1.write(b'\x00' * m)
            # The data must be flushed before the file is mapped.
            f1.flush()
            mfile1 = mmap.mmap(f1.fileno(), 0)
            mslice1 = MmapSlice(mfile1, m, 0)
            test = HyperLogLog(0.01, mslice1)

            self.assertEqual(len(test), 0)
            test.add('test_val')
            self.assertEqual(len(test), 1)
Esempio n. 13
0
    def test_add(self):
        # Add "0".."9", call upgrade(), then compare the registers above zero
        # with the reference (index, value) pairs.
        sketch = HyperLogLog(0.05)
        for text in map(str, range(10)):
            sketch.add(text)
        sketch.upgrade()

        nonzero = [(slot, rank) for slot, rank in enumerate(sketch.M) if rank > 0]
        expected = [
            (1, 1), (41, 1), (44, 1), (76, 3), (103, 4),
            (182, 1), (442, 2), (464, 5), (497, 1), (506, 1),
        ]
        numpy.testing.assert_array_equal(nonzero, expected)
 def test_3(self):
     # Compare the HyperLogLog distinct-count estimate with the exact count
     # (a Python set) over 1e5 random integers drawn from [-1e10, 1e10).
     size = int(1e5)
     bound = int(1e10)
     # The dtype is explicit because +/-1e10 does not fit in 32 bits, and
     # randint's default integer type is 32-bit on some platforms.
     data = np.random.randint(low=-bound, high=bound, size=size, dtype=np.int64)
     b = int(np.log2(size))  # Num bits needed.
     hll = HyperLogLog(hash_fn=hashlib.sha1, num_bits=b)
     my_set = set()
     for val in data:
         hll.add(val)
         my_set.add(val)
     my_set_n = len(my_set)
     hll_n = hll.get_num_distinct()
     # Difference of the two counts relative to their sum.
     error_rate = np.abs(float(my_set_n - hll_n) / float(my_set_n + hll_n))
     accepted_error_rate = np.abs(standard_error(2 ** b))
     assert error_rate <= accepted_error_rate
Esempio n. 15
0
    def test_update(self):
        # Two sketches backed by separate slices of one memory-mapped file are
        # merged with update(); the merged length is checked against
        # 2 * test_set_size, within error_rate.
        m = 16384
        # Room for three m-byte slices, extended to the next page boundary
        # strictly above 3 * m.
        flen = (m*3) + mmap.PAGESIZE - (m*3) % mmap.PAGESIZE

        # The with-block closes the temporary file on every exit path.
        with tempfile.TemporaryFile() as f:
            self.assertGreater(flen, m*3)

            # TemporaryFile is binary by default, so write bytes. Repetition
            # replaces joining a list of flen one-character strings.
            f.write(b'\x00' * flen)
            # Flush the write buffer so the file has its full size on the
            # descriptor before it is mapped.
            f.flush()
            fmap = mmap.mmap(f.fileno(), m*3)

            self.assertEqual(len(fmap), m*3)

            mslice1 = MmapSlice(fmap, m, 0)
            mslice2 = MmapSlice(fmap, m, m)
            mslice3 = MmapSlice(fmap, m, m*2)

            hll1 = HyperLogLog(self.error_rate, mslice1)
            hll2 = HyperLogLog(self.error_rate, mslice2)
            # NOTE(review): hll3 is constructed but never filled or merged.
            hll3 = HyperLogLog(self.error_rate, mslice3)

            for v in self.test_data1:
                hll1.add(v)
            for v in self.test_data2:
                hll2.add(v)

            hll1.update(hll2)

            self.assertAlmostEqual(self.test_set_size*2, len(hll1), delta=self.test_set_size*2*self.error_rate)
Esempio n. 16
0
    def test_mmap(self):
        # A HyperLogLog backed by an m-byte mapping of a temporary file must
        # report about test_set_size after test_data1 is added, within
        # error_rate.
        m = 16384
        # File length: the next page boundary strictly above m.
        flen = m + mmap.PAGESIZE - m % mmap.PAGESIZE

        # The with-block closes the temporary file on every exit path.
        with tempfile.TemporaryFile() as f:
            self.assertGreater(flen, m)

            # Binary file, so the zero fill is a bytes object.
            f.write(b'\x00' * flen)
            # Flush buffered data so the file is fully written before mmap.
            f.flush()
            fmap = mmap.mmap(f.fileno(), m)

            self.assertEqual(len(fmap), m)

            mslice = MmapSlice(fmap, m)
            hll = HyperLogLog(self.error_rate, mslice)
            for v in self.test_data1:
                hll.add(v)
            self.assertAlmostEqual(self.test_set_size, len(hll), delta=self.test_set_size*self.error_rate)
Esempio n. 17
0
    def test_calc_cardinality(self):
        # For each cardinality, average card() over n sketches fed random
        # 20-byte values and require the z-score of that mean (using
        # rel_err * card / sqrt(n) as the standard error) to lie in
        # (-1.96, 1.96).
        clist = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
        n = 30
        rel_err = 0.05

        for card in clist:
            total = 0.0
            for _trial in xrange(n):
                sketch = HyperLogLog(rel_err)
                for _item in xrange(card):
                    sketch.add(os.urandom(20))
                total += sketch.card()

            mean_estimate = float(total) / n
            std_err = rel_err * card / math.sqrt(n)
            z = (mean_estimate - card) / std_err
            self.assertLess(-1.96, z)
            self.assertGreater(1.96, z)
Esempio n. 18
0
    def test_calc_cardinality(self):
        # Statistical check: the mean of n card() estimates per cardinality
        # must give a z-score strictly between -1.96 and 1.96.
        clist = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
        n = 30
        rel_err = 0.05

        def one_estimate(card):
            # Fill a fresh sketch with `card` random 20-byte values.
            sketch = HyperLogLog(rel_err)
            for _item in xrange(card):
                sketch.add(os.urandom(20))
            return sketch.card()

        for card in clist:
            s = 0.0
            for _trial in xrange(n):
                s += one_estimate(card)

            z = (float(s) / n - card) / (rel_err * card / math.sqrt(n))
            self.assertLess(-1.96, z)
            self.assertGreater(1.96, z)
Esempio n. 19
0
    def test_mmap(self):
        # Fill an mmap-backed HyperLogLog with test_data1 and check that its
        # length is test_set_size within error_rate.
        m = 16384
        # File length: the next page boundary strictly above m.
        flen = m + mmap.PAGESIZE - m % mmap.PAGESIZE

        # The with-block closes the temporary file on every exit path.
        with tempfile.TemporaryFile() as f:
            self.assertGreater(flen, m)

            # Binary file, so write bytes; repetition replaces the list join.
            f.write(b'\x00' * flen)
            # Flush so the file has its full size before it is mapped.
            f.flush()
            fmap = mmap.mmap(f.fileno(), m)

            self.assertEqual(len(fmap), m)

            mslice = MmapSlice(fmap, m)
            hll = HyperLogLog(self.error_rate, mslice)
            for v in self.test_data1:
                hll.add(v)
            self.assertAlmostEqual(self.test_set_size,
                                   len(hll),
                                   delta=self.test_set_size * self.error_rate)
Esempio n. 20
0
 def test_alpha(self):
     # _get_alpha(b) for b = 4..9 must equal these reference constants.
     expected = [0.673, 0.697, 0.709, 0.7152704932638152, 0.7182725932495458, 0.7197831133217303]
     computed = []
     for bits in range(4, 10):
         computed.append(HyperLogLog._get_alpha(bits))
     self.assertEqual(computed, expected)
Esempio n. 21
0
    def test_update(self):
        # a receives "0","1"; b receives "2","3"; c receives all four.
        # After a.update(b): a != b, b != c and a == c.
        a = HyperLogLog(0.05)
        b = HyperLogLog(0.05)
        c = HyperLogLog(0.05)

        for target, start, stop in ((a, 0, 2), (b, 2, 4)):
            for i in xrange(start, stop):
                target.add(str(i))
                c.add(str(i))

        a.update(b)

        self.assertNotEqual(a, b)
        self.assertNotEqual(b, c)
        self.assertEqual(a, c)
Esempio n. 22
0
 def setUp(self):
     # Build a fresh pair of HyperLogLog(16, 16) sketches for every test.
     for attribute in ("hll", "hll2"):
         setattr(self, attribute, HyperLogLog(16, 16))
Esempio n. 23
0
 def test_init(self):
     # HyperLogLog(0.05) must report p == 9, the matching alpha constant,
     # m == 512 and a register array M of length 512.
     sketch = HyperLogLog(0.05)
     register_count = 512
     self.assertEqual(sketch.p, 9)
     self.assertEqual(sketch.alpha, 0.7197831133217303)
     self.assertEqual(sketch.m, register_count)
     self.assertEqual(len(sketch.M), register_count)
Esempio n. 24
0
import logging

from hll import HyperLogLog, MartingaleHyperLogLog

logging.basicConfig(level=logging.DEBUG)

# Feed every integer listed in 'random_ints' (one per line) into a plain and
# a martingale HyperLogLog, then print each sketch's count.
hll = HyperLogLog(10, 54)
mhll = MartingaleHyperLogLog(10, 54)

# The with-block closes the file; iterating it streams the lines instead of
# loading them all with readlines().
with open('random_ints', 'r') as f:
    for line in f:
        # Skip blank lines (for example a trailing newline) rather than
        # letting int() fail on them.
        if not line.strip():
            continue
        value = int(line)
        hll.add_object(value)
        mhll.add_object(value)

# The parenthesised form prints the same value on Python 2 and Python 3.
print(hll.unadjusted_count)
print(mhll.count)
Esempio n. 25
0
    def test_update_err(self):
        # update() between sketches built with different error rates
        # (0.05 vs 0.01) must raise ValueError.
        coarse = HyperLogLog(0.05)
        fine = HyperLogLog(0.01)

        with self.assertRaises(ValueError):
            coarse.update(fine)
Esempio n. 26
0
    def test_update(self):
        # left holds "0","1"; right holds "2","3"; union holds all four.
        # Merging right into left must make left equal to union, while right
        # stays different from both.
        left = HyperLogLog(0.05)
        right = HyperLogLog(0.05)
        union = HyperLogLog(0.05)

        for number in xrange(2):
            left.add(str(number))
            union.add(str(number))

        for number in xrange(2, 4):
            right.add(str(number))
            union.add(str(number))

        left.update(right)

        self.assertNotEqual(left, right)
        self.assertNotEqual(right, union)
        self.assertEqual(left, union)
Esempio n. 27
0
    def test_update3(self):
        # Three sketches backed by separate slices of one memory-mapped file;
        # hll2 and hll3 are merged into hll1 in a single update() call and
        # the result is checked against 3 * test_set_size within error_rate.
        m = 16384
        # Room for three m-byte slices, extended to the next page boundary
        # strictly above 3 * m.
        flen = (m * 3) + mmap.PAGESIZE - (m * 3) % mmap.PAGESIZE

        # The with-block closes the temporary file on every exit path.
        with tempfile.TemporaryFile() as f:
            self.assertGreater(flen, m * 3)

            # Binary file, so write bytes; repetition replaces the list join.
            f.write(b'\x00' * flen)
            # Flush so the file has its full size before it is mapped.
            f.flush()
            fmap = mmap.mmap(f.fileno(), m * 3)

            self.assertEqual(len(fmap), m * 3)

            mslice1 = MmapSlice(fmap, m, 0)
            mslice2 = MmapSlice(fmap, m, m)
            mslice3 = MmapSlice(fmap, m, m * 2)

            hll1 = HyperLogLog(self.error_rate, mslice1)
            hll2 = HyperLogLog(self.error_rate, mslice2)
            hll3 = HyperLogLog(self.error_rate, mslice3)

            for v in self.test_data1:
                hll1.add(v)
            for v in self.test_data2:
                hll2.add(v)
            for v in self.test_data3:
                hll3.add(v)

            hll1.update([hll2, hll3])

            self.assertAlmostEqual(self.test_set_size * 3,
                                   len(hll1),
                                   delta=self.test_set_size * 3 * self.error_rate)
Esempio n. 28
0
from unittest import TestCase
from hll import HyperLogLog
import math

class HyperLogLogTestCase(TestCase):
    def test_alpha(self):
        # _get_alpha(b) for b = 4..9 must equal these reference constants.
        expected = [0.673, 0.697, 0.709, 0.7152704932638152, 0.7182725932495458, 0.7197831133217303]
        computed = list(map(HyperLogLog._get_alpha, range(4, 10)))
        self.assertEqual(computed, expected)
    
    def test_alpha_bad(self):
        # _get_alpha must raise ValueError for the arguments 1 and 17.
        for bad_bits in (1, 17):
            self.assertRaises(ValueError, HyperLogLog._get_alpha, bad_bits)

    def test_rho(self):
        # _get_rho(w, arr) is checked against a table of expected values,
        # where arr[i] == 2**i for i = 0..32; w == 1 << 32 must raise
        # ValueError.
        # A plain `1 << i` replaces the Python-2-only `1L` literal; the
        # resulting values are the same.
        arr = [1 << i for i in range(32 + 1)]
        cases = (
            (0, 33),
            (1, 32),
            (2, 31),
            (3, 31),
            (4, 30),
            (5, 30),
            (6, 30),
            (7, 30),
            (1 << 31, 1),
        )
        for w, expected in cases:
            self.assertEqual(HyperLogLog._get_rho(w, arr), expected)
        self.assertRaises(ValueError, HyperLogLog._get_rho, 1 << 32, arr)

    def test_init(self):
        # HyperLogLog(0.05) must report b == 9, the matching alpha constant
        # and m == 512.
        s = HyperLogLog(0.05)
        self.assertEqual(s.b, 9)
        self.assertEqual(s.alpha, 0.7197831133217303)
        self.assertEqual(s.m, 512)