Esempio n. 1
0
    def test_large_range_correction(self):
        """Check the estimate of a HyperLogLog(16) with almost all registers set.

        Every register except the last one is set to rank 16; the resulting
        cardinality must fall inside [7916284520, 7916284521].
        """
        sketch = HyperLogLog(16)
        last_index = sketch.size() - 1
        # range() stops before last_index, so the final register is not
        # written by this loop.
        for index in range(last_index):
            sketch.set_register(index, 16)

        estimate = sketch.cardinality()
        self.assertTrue(7916284520 <= estimate <= 7916284521)
Esempio n. 2
0
    def __init__(self, error_rate=0.005):
        """Create the backing HyperLogLog sized for the given error rate.

        The sizing follows error_rate = 1.04 / sqrt(m) with m = 2 ** p
        registers, so p is the smallest integer with
        2 ** p >= (1.04 / error_rate) ** 2.
        """
        self.error_rate = error_rate

        # Number of registers (m) required by the error bound.
        registers_needed = (1.04 / error_rate) ** 2
        # Round the base-2 logarithm up to get the precision parameter p.
        precision = int(math.ceil(math.log(registers_needed, 2)))
        self.hll = HyperLogLog(precision)
Esempio n. 3
0
 def _hll_init(v):
     """Map every element of ``v`` to the registers of a one-item sketch.

     A single HyperLogLog (sized by the free variable ``k`` from the
     surrounding scope) is reused for all elements: it is reset to its
     initial registers, the element's string form is added unless the
     element is None, and the registers are returned.
     """
     sketch = HyperLogLog(k)
     # Registers as reported right after construction; used to reset the
     # sketch between elements.
     blank = sketch.registers()

     def registers_for(value):
         sketch.set_registers(blank)
         if value is not None:
             sketch.add(str(value))
         return sketch.registers()

     return v.apply(registers_for)
Esempio n. 4
0
 def _hll_merge(v):
     """Merge all register blobs in ``v`` and return the merged registers.

     Each item of ``v`` is converted to a bytearray, loaded into a scratch
     sketch and merged into an accumulator. Both sketches are sized by the
     free variable ``k`` from the surrounding scope.
     """
     merged = HyperLogLog(k)
     scratch = HyperLogLog(k)
     for blob in v:
         # Load the serialized registers, then fold them into the result.
         scratch.set_registers(bytearray(blob))
         merged.merge(scratch)
     return merged.registers()
Esempio n. 5
0
class TestAdd(unittest.TestCase):
    """Tests for HyperLogLog.add with different input types."""

    def setUp(self):
        """Create a fresh HyperLogLog(5) for every test."""
        self.hll = HyperLogLog(5)

    def _add_or_fail(self, make_item, failure_template):
        """Add ``make_item()`` to the sketch, failing the test on any error.

        The item is built inside the ``try`` so that an error while
        constructing it is reported as a test failure as well.
        """
        try:
            self.hll.add(make_item())
        except Exception as ex:
            self.fail(failure_template % ex)

    def test_add_string(self):
        """Adding a str must not raise."""
        self._add_or_fail(lambda: 'asdf', 'failed to add string: %s')

    @unittest.skipIf(sys.version_info[0] > 2,
                     'buffer is deprecated in python 3.x')
    def test_add_buffer(self):
        """Adding a Python 2 buffer must not raise."""
        self._add_or_fail(lambda: buffer('asdf'), 'failed to add buffer: %s')

    def test_add_bytes(self):
        """Adding a bytes object must not raise."""
        self._add_or_fail(lambda: b'asdf', 'failed to add bytes: %s')

    def test_cardinality_estimate_changed_return(self):
        """add() is truthy for new data and falsy for a repeated item."""
        self.assertTrue(self.hll.add('asdf'))
        # Same item again: the return value must be falsy.
        self.assertFalse(self.hll.add('asdf'))
        self.assertTrue(self.hll.add('otherdata'))
Esempio n. 6
0
 def _hll_merge(v: pd.DataFrame) -> bytes:
     """Merge every register blob in ``v`` and return the merged registers.

     Iterates ``v``, loads each item (converted to a bytearray) into a
     scratch sketch and merges it into an accumulator sketch. Both are
     sized by the free variable ``k`` from the surrounding scope.
     """
     accumulator = HyperLogLog(k)
     scratch = HyperLogLog(k)
     for serialized in v:
         scratch.set_registers(bytearray(serialized))
         accumulator.merge(scratch)
     return accumulator.registers()
Esempio n. 7
0
def _stats_from_json(json: Dict[str, Any]) -> Dict[str, Any]:
    stats = {}
    if 'messages_sent' in json:
        stats['messages_sent'] = json['messages_sent']
    if 'messages_received' in json:
        stats['messages_received'] = json['messages_received']
    if 'users_active' in json:
        hll = HyperLogLog(12)
        hll.set_registers(bytearray(json['users_active']))
        stats['users_active'] = hll
    return stats
Esempio n. 8
0
 def getOutliers(self):
     """Return a dict mapping key -> sigmas above the mean (> sigmaCount).

     The mean and standard deviation are taken over the cardinalities of
     all short-term sketches in ``self.shortCardDict``. Candidates are the
     ``self.topN`` largest counts plus every key reported by
     ``self.timePeriodMap.getActives()``. All short-term sketches are
     replaced by fresh ``HyperLogLog(16)`` instances before returning.

     NOTE(review): active keys are looked up in ``shortCardDict`` and
     skipped when absent. The original code reused the ``hll`` variable
     left over from the first loop, so every active key was scored with
     the same (last-iterated) sketch -- confirm this is the intended
     source of the per-key count.
     """
     outliers = {}
     stats = Stdev()
     heap = []
     topN = self.topN
     for key, hll in self.shortCardDict.items():
         shortCount = hll.cardinality()
         stats.add(shortCount)
         # Min-heap capped at topN entries: once full, pushing and popping
         # the smallest keeps only the largest counts seen so far.
         if len(heap) < topN:
             heapq.heappush(heap, (shortCount, key))
         else:
             heapq.heappushpop(heap, (shortCount, key))
     mean = stats.getMean()
     stdv = stats.getStdev()
     threshold = mean + self.sigmaCount * stdv

     for cnt, key in heap:
         if cnt > threshold:
             outliers[key] = (cnt - mean) / stdv

     for key in self.timePeriodMap.getActives():
         # .get() avoids inserting a new entry when shortCardDict is a
         # defaultdict and the key has no short-term sketch.
         hll = self.shortCardDict.get(key)
         if hll is None:
             continue
         cnt = hll.cardinality()
         if cnt > threshold:
             outliers[key] = (cnt - mean) / stdv

     # empty short-term HLLs
     for key in list(self.shortCardDict):
         self.shortCardDict[key] = HyperLogLog(16)
     return outliers
Esempio n. 9
0
    def test_merge(self):
        """Merging keeps the registers that were set in either sketch."""
        target = HyperLogLog(2)
        other = HyperLogLog(2)
        target.set_register(0, 1)
        other.set_register(3, 1)

        target.merge(other)

        # Four registers: index 0 came from target, index 3 from other.
        expected = bytearray([1, 0, 0, 1])
        self.assertEqual(target.registers(), expected)
Esempio n. 10
0
    def _hll_count(v: pd.Series) -> pd.Series:
        """Map each register blob in ``v`` to its integer cardinality.

        One HyperLogLog (sized by the free variable ``k`` from the
        surrounding scope) is reused: each element is loaded as a bytearray
        of registers and the truncated cardinality estimate is returned.
        """
        sketch = HyperLogLog(k)

        def estimate(serialized):
            sketch.set_registers(bytearray(serialized))
            return int(sketch.cardinality())

        return v.apply(estimate)
Esempio n. 11
0
    def setUp(self):
        """Build randomly seeded sketches for k = 4..15 and fill them.

        The i-th sketch receives the strings '1' .. str(n - 1) where n is
        the i-th value of x ** 5 for x = 1..15; zip() stops at the shorter
        sequence, i.e. after the twelve sketches.
        """
        sketches = [HyperLogLog(k, randint(1, 1000)) for k in range(4, 16)]
        limits = [base ** 5 for base in range(1, 16)]

        for sketch, limit in zip(sketches, limits):
            for value in range(1, limit):
                sketch.add(str(value))
        self.hlls = sketches
Esempio n. 12
0
    def test_seed_parameter_sets_seed(self):
        """The seed keyword is stored and changes the murmur3 hash."""
        seeded_four = HyperLogLog(5, seed=4)
        self.assertEqual(seeded_four.seed(), 4)

        seeded_two = HyperLogLog(5, seed=2)
        # Same input, different seeds: the hashes must differ.
        hash_four = seeded_four.murmur3_hash('test')
        hash_two = seeded_two.murmur3_hash('test')
        self.assertNotEqual(hash_four, hash_two)
Esempio n. 13
0
class TestCardinalityEstimation(unittest.TestCase):
    """Cardinality estimates for hand-set register states."""

    def setUp(self):
        """Create a fresh HyperLogLog(5) for every test."""
        self.hll = HyperLogLog(5)

    def test_small_range_correction_all_registers_set_to_zero(self):
        """An untouched sketch estimates exactly 0.0."""
        estimate = self.hll.cardinality()
        self.assertEqual(estimate, 0.0)

    def test_small_range_correction_not_all_registers_set_to_zero(self):
        """A single register at rank 1 yields roughly 1.4657."""
        self.hll.set_register(0, 1)
        estimate = self.hll.cardinality()
        self.assertTrue(1.46571806761 <= estimate <= 1.46571806762)

    def test_medium_range_no_correction(self):
        """Registers 0..31 at rank 2 estimate between 89.216 and 89.217."""
        for index in range(32):
            self.hll.set_register(index, 2)

        estimate = self.hll.cardinality()
        self.assertTrue(89.216 <= estimate <= 89.217)

    @unittest.skip("correction value needs to be re-computed")
    def test_large_range_correction(self):
        """Skipped: bounds for a nearly saturated HyperLogLog(16)."""
        sketch = HyperLogLog(16)
        # All registers except the last are set to rank 16.
        for index in range(sketch.size() - 1):
            sketch.set_register(index, 16)

        estimate = sketch.cardinality()
        self.assertTrue(7916284520 <= estimate <= 7916284521)
Esempio n. 14
0
class TestCardinalityEstimation(unittest.TestCase):
    """Checks HyperLogLog.cardinality() against fixed expected ranges."""

    def setUp(self):
        """Each test starts from a new HyperLogLog(5)."""
        self.hll = HyperLogLog(5)

    def _assert_estimate_between(self, sketch, low, high):
        """Assert that low <= sketch.cardinality() <= high."""
        value = sketch.cardinality()
        self.assertTrue(low <= value and value <= high)

    def test_small_range_correction_all_registers_set_to_zero(self):
        """With no register set the estimate equals 0.0."""
        self.assertEqual(self.hll.cardinality(), 0.0)

    def test_small_range_correction_not_all_registers_set_to_zero(self):
        """One register at rank 1 gives an estimate of about 1.4657."""
        self.hll.set_register(0, 1)
        self._assert_estimate_between(
            self.hll, 1.46571806761, 1.46571806762)

    def test_medium_range_no_correction(self):
        """Rank 2 in registers 0..31 gives an estimate of about 89.216."""
        for register_index in range(32):
            self.hll.set_register(register_index, 2)

        self._assert_estimate_between(self.hll, 89.216, 89.217)

    @unittest.skip("correction value needs to be re-computed")
    def test_large_range_correction(self):
        """Skipped: expected range for a nearly full HyperLogLog(16)."""
        big = HyperLogLog(16)
        # The last register is left untouched (range stops at size - 1).
        for register_index in range(big.size() - 1):
            big.set_register(register_index, 16)

        self._assert_estimate_between(big, 7916284520, 7916284521)
Esempio n. 15
0
class ProbabilisticCounter(object):
    """Approximate distinct-value counter backed by a HyperLogLog.

    error_rate: 1% = 0.01, 0.5% = 0.005 (min 0.005)
    """

    def __init__(self, error_rate=0.005):
        """Size the underlying sketch from the requested error rate.

        Uses error_rate = 1.04 / sqrt(m) with m = 2 ** p registers and
        picks the smallest integer p that satisfies it.
        """
        self.error_rate = error_rate

        # m: number of registers demanded by the error bound.
        registers_needed = (1.04 / error_rate) ** 2
        # p: base-2 logarithm of m, rounded up.
        precision = int(math.ceil(math.log(registers_needed, 2)))
        self.hll = HyperLogLog(precision)

    def add(self, value):
        """Add ``value`` and return the sketch's result.

        Returns True when the value is new, False when it was already
        included (as reported by the underlying HyperLogLog.add).
        """
        is_new = self.hll.add(value)
        return is_new

    def count(self):
        """Return the cardinality estimate rounded down with math.floor."""
        estimate = self.hll.cardinality()
        return math.floor(estimate)
 def __init__(self, sigmaCount=5, period=86400, tolerance=0.001):
     """Initialise per-key long-term state on top of the base class.

     ``sigmaCount`` and ``period`` are forwarded to the parent
     constructor; ``tolerance`` is stored on the instance.
     """
     super().__init__(sigmaCount, period=period)

     def _new_sketch():
         # Factory for the per-key long-term cardinality sketch.
         return HyperLogLog(16)

     self.longCardDict = defaultdict(_new_sketch)
     self.prevLongCard = {}
     # Per-key trackers, created on first access. The original source
     # kept SlopeWindow / AverageWindow as commented-out alternatives.
     self.slopeDict = defaultdict(lambda: ILS())
     self.avgDict = defaultdict(lambda: Stdev())
     self.totalCount = 0
     self.updatePeriod = 0
     self.tolerance = tolerance
     self.frozenHosts = set()
     # An exact set; the original comment mentions HyperLogLog(16) as an
     # alternative.
     self.everFrozen = set()
Esempio n. 17
0
    def test_large_range_correction(self):
        """Estimate of a HyperLogLog(16) with all but the last register at 16.

        The cardinality must lie within [7916284520, 7916284521].
        """
        lower_bound, upper_bound = 7916284520, 7916284521

        sketch = HyperLogLog(16)
        for register_index in range(sketch.size() - 1):
            sketch.set_register(register_index, 16)

        estimate = sketch.cardinality()
        self.assertTrue(lower_bound <= estimate <= upper_bound)
Esempio n. 18
0
def harmonic_centrality(G, max_distance=6):
    """Approximate harmonic centrality using per-node HyperLogLog sketches.

    For every node a sketch of the nodes reachable within ``d`` steps is
    built for d = 0..max_distance by merging the node's own sketch and its
    neighbours' sketches from step d - 1. The growth in estimated
    cardinality between step d - 1 and step d, divided by d, is added to
    the node's score.

    Args:
        G: a networkx graph (it is passed to ``networkx.all_neighbors``).
        max_distance: largest step count considered.

    Returns:
        A plain dict mapping node -> accumulated score. Nodes get an entry
        only when ``max_distance`` >= 1.
    """
    harmonic = defaultdict(int)
    # t_steps_set[node][d] is the sketch of nodes within d steps of node.
    t_steps_set = defaultdict(lambda: defaultdict(lambda: HyperLogLog(10)))

    # Iterating the graph yields its nodes on every networkx release;
    # ``G.nodes_iter()`` was removed in networkx 2.0.
    for node in G:
        t_steps_set[node][0].add(str(node))

    for distance in range(1, max_distance + 1):
        for node in G:
            current = t_steps_set[node][distance]
            previous = t_steps_set[node][distance - 1]
            current.merge(previous)
            for next_node in networkx.all_neighbors(G, node):
                current.merge(t_steps_set[next_node][distance - 1])
            # Newly reached nodes at exactly this distance, weighted 1/d.
            harmonic[node] += (
                current.cardinality() - previous.cardinality()) / distance
    return dict(harmonic)
Esempio n. 19
0
 def _collect(self, stat: str, user: User, client: Client) -> None:
     """Record one occurrence of ``stat`` for the given client.

     Stats are kept in ``self.by_client[client_id][stat]``. The
     'users_active' stat is a HyperLogLog(12) fed with ``user.email``;
     every other stat is an integer counter incremented by one.
     """
     assert user is not None
     assert client is not None
     if self.by_client is None:
         self.by_client = {}
     per_client = self.by_client
     client_id = self._get_client_id(client)
     counters = per_client.setdefault(client_id, {})

     if stat == 'users_active':
         # The sketch is created lazily on first use.
         if stat not in counters:
             counters[stat] = HyperLogLog(12)
         counters[stat].add(user.email)
         return

     counters[stat] = counters.get(stat, 0) + 1
 def getOutliers(self):
     """Return a dict mapping key -> sigmas at or above ``sigmaCount``.

     Each key's short-term cardinality is compared with that key's own
     running mean and standard deviation from ``self.stdevDict``; the
     count is then added to that history. All short-term sketches are
     replaced by fresh ``HyperLogLog(16)`` instances before returning.

     NOTE(review): if ``getStdev()`` can return 0 the division below
     raises ZeroDivisionError -- confirm against the Stdev class.
     """
     outliers = {}
     topN = self.topN  # read as in the original; not used below
     for key, hll in self.shortCardDict.items():
         observed = hll.cardinality()
         history = self.stdevDict[key]
         prevMean = history.getMean()
         prevStdev = history.getStdev()
         if observed >= prevMean + prevStdev * self.sigmaCount:
             outliers[key] = (observed - prevMean) / prevStdev
         # Fold the new observation into the history only after it has
         # been judged against the previous statistics.
         history.add(observed)
     # Reset the short-term sketches.
     for key in self.shortCardDict:
         self.shortCardDict[key] = HyperLogLog(16)
     return outliers
Esempio n. 21
0
 def getOutliersAll(self):
     """Return a dict mapping key -> sigmas above the mean (> sigmaCount).

     The mean and standard deviation come from the cardinalities of the
     sketches in ``self.shortCardDict``; the keys that are tested against
     them come from ``self.cardinalityDict``. All short-term sketches are
     replaced by fresh ``HyperLogLog(16)`` instances before returning.
     """
     stats = Stdev()
     for sketch in self.shortCardDict.values():
         stats.add(sketch.cardinality())
     mean = stats.getMean()
     stdv = stats.getStdev()

     outliers = {}
     for key, sketch in self.cardinalityDict.items():
         count = sketch.cardinality()
         if count > mean + self.sigmaCount * stdv:
             outliers[key] = (count - mean) / stdv

     # Reset the short-term sketches.
     for key in self.shortCardDict:
         self.shortCardDict[key] = HyperLogLog(16)
     return outliers
Esempio n. 22
0
class TestAdd(unittest.TestCase):
    """HyperLogLog.add must accept str, buffer (Python 2) and bytes."""

    def setUp(self):
        """Create a fresh HyperLogLog(5) for every test."""
        self.hll = HyperLogLog(5)

    def test_add_string(self):
        """Adding a str must not raise."""
        try:
            self.hll.add('asdf')
        except Exception as error:
            message = 'failed to add string: %s' % error
            self.fail(message)

    @unittest.skipIf(sys.version_info[0] > 2, 'buffer is deprecated in python 3.x')
    def test_add_buffer(self):
        """Adding a Python 2 buffer must not raise."""
        try:
            self.hll.add(buffer('asdf'))
        except Exception as error:
            message = 'failed to add buffer: %s' % error
            self.fail(message)

    def test_add_bytes(self):
        """Adding a bytes object must not raise."""
        try:
            self.hll.add(b'asdf')
        except Exception as error:
            message = 'failed to add bytes: %s' % error
            self.fail(message)
Esempio n. 23
0
class TestAdd(unittest.TestCase):
    """Checks that HyperLogLog.add accepts the supported input types."""

    def setUp(self):
        """Each test gets its own HyperLogLog(5)."""
        self.hll = HyperLogLog(5)

    def _check_add(self, build_value, on_error):
        """Add ``build_value()``; turn any exception into a test failure.

        ``on_error`` is a %-template that receives the caught exception.
        The value is built inside the ``try`` block so construction errors
        are reported the same way.
        """
        try:
            self.hll.add(build_value())
        except Exception as ex:
            self.fail(on_error % ex)

    def test_add_string(self):
        """A str can be added."""
        self._check_add(lambda: 'asdf', 'failed to add string: %s')

    @unittest.skipIf(sys.version_info[0] > 2,
                     'buffer is deprecated in python 3.x')
    def test_add_buffer(self):
        """A Python 2 buffer can be added."""
        self._check_add(lambda: buffer('asdf'), 'failed to add buffer: %s')

    def test_add_bytes(self):
        """A bytes object can be added."""
        self._check_add(lambda: b'asdf', 'failed to add bytes: %s')
 def __init__(self, sigmaCount=5, period=3600, topN=10):
     """Initialise short-term per-key state on top of the base class.

     ``sigmaCount`` and ``period`` are forwarded to the parent
     constructor; ``topN`` is stored on the instance.
     """
     super().__init__(sigmaCount, period=period)

     def _new_sketch():
         # Factory for the per-key short-term cardinality sketch.
         return HyperLogLog(16)

     self.topN = topN
     self.shortCardDict = defaultdict(_new_sketch)
     # One Stdev accumulator per key, created on first access.
     self.stdevDict = defaultdict(Stdev)
     self.totalCount = 0
Esempio n. 25
0
 def test_all_registers_initialized_to_zero(self):
     """Every register of a new HyperLogLog(5) must equal 0."""
     sketch = HyperLogLog(5)
     for value in sketch.registers():
         self.assertEqual(value, 0)
Esempio n. 26
0
 def test_k_param_determines_the_number_of_registers(self):
     """HyperLogLog(5) exposes 32 registers via registers() and size()."""
     sketch = HyperLogLog(5)
     register_count = len(sketch.registers())
     self.assertEqual(register_count, 32)
     self.assertEqual(sketch.size(), 32)
Esempio n. 27
0
 def test_all_registers_initialized_to_zero(self):
     """A freshly built HyperLogLog(5) has only zero-valued registers."""
     fresh_registers = HyperLogLog(5).registers()
     for register_value in fresh_registers:
         self.assertEqual(register_value, 0)
Esempio n. 28
0
 def test_only_same_size_HyperLogLogs_can_be_merged(self):
     """Merging a HyperLogLog(5) into a HyperLogLog(4) raises ValueError."""
     smaller = HyperLogLog(4)
     larger = HyperLogLog(5)
     self.assertRaises(ValueError, smaller.merge, larger)
Esempio n. 29
0
 def setUp(self):
     """Give every test its own HyperLogLog(5) as ``self.hll``."""
     sketch = HyperLogLog(5)
     self.hll = sketch
Esempio n. 30
0
class TestRegisterFunctions(unittest.TestCase):
    """Tests for set_register() and registers() on a HyperLogLog(5)."""

    def setUp(self):
        """Create a fresh sketch; ``self.k`` mirrors its size parameter."""
        self.k = 5
        self.hll = HyperLogLog(5)

    def test_set_last_register(self):
        """The highest valid index (size - 1) can be written and read."""
        # The original used index k - 1 (= 4), which is not the last of
        # the 2 ** k registers.
        last = self.hll.size() - 1
        self.hll.set_register(last, 1)
        self.assertTrue(self.hll.registers()[last] == 1)

    def test_set_first_register(self):
        """Index 0 can be written and read back."""
        self.hll.set_register(0, 1)
        self.assertTrue(self.hll.registers()[0] == 1)

    def test_set_register_with_negative_value_fails(self):
        """A negative rank is rejected with ValueError."""
        with self.assertRaises(ValueError):
            self.hll.set_register(0, -1)

    def test_set_register_with_greater_than_max_rank_fails(self):
        """A rank of 33 is rejected with ValueError."""
        with self.assertRaises(ValueError):
            self.hll.set_register(0, 33)

    def test_set_register_with_index_out_of_bounds(self):
        """Index 32 (one past the last register) raises IndexError."""
        with self.assertRaises(IndexError):
            self.hll.set_register(32, 1)

    def test_set_register_with_negative_index_fails(self):
        """Expect ValueError from set_register(0, -1).

        NOTE(review): despite the name this passes index 0 and rank -1,
        duplicating the negative-value test. Confirm which exception a
        negative index raises before changing the arguments.
        """
        with self.assertRaises(ValueError):
            self.hll.set_register(0, -1)

    def test_bytesarray_has_correct_values(self):
        """Values written to every register are returned by registers()."""
        # Cover all 32 registers; the original stopped at index 30.
        expected = bytearray(randint(0, 16) for _ in range(32))
        for index, rank in enumerate(expected):
            self.hll.set_register(index, rank)

        registers = self.hll.registers()
        for index, rank in enumerate(expected):
            self.assertEqual(rank, registers[index])

    def test_registers_returns_bytesarray(self):
        """registers() returns exactly a bytearray."""
        self.assertTrue(type(self.hll.registers()) is bytearray)

    def test_bytesarray_has_correct_length(self):
        """registers() has 2 ** k entries."""
        self.assertTrue(len(self.hll.registers()) == pow(2, self.k))
Esempio n. 31
0
 def test_one_is_invalid_size(self):
     """Constructing HyperLogLog(0) must raise ValueError."""
     self.assertRaises(ValueError, HyperLogLog, 0)
Esempio n. 32
0
 def test_k_param_determines_the_number_of_registers(self):
     """With k = 5 both len(registers()) and size() must be 32."""
     expected_count = 32
     sketch = HyperLogLog(5)
     self.assertEqual(len(sketch.registers()), expected_count)
     self.assertEqual(sketch.size(), expected_count)
Esempio n. 33
0
    def test_seed_parameter_sets_seed(self):
        """seed() reports the given seed; different seeds hash differently."""
        first = HyperLogLog(5, seed=4)
        self.assertEqual(first.seed(), 4)

        second = HyperLogLog(5, seed=2)
        first_hash = first.murmur3_hash('test')
        second_hash = second.murmur3_hash('test')
        self.assertNotEqual(first_hash, second_hash)
Esempio n. 34
0
class TestRegisterFunctions(unittest.TestCase):
    """Tests for set_register(), set_registers() and registers()."""

    def setUp(self):
        """Create a fresh sketch; ``self.k`` mirrors its size parameter."""
        self.k = 5
        self.hll = HyperLogLog(5)

    def test_set_last_register(self):
        """The highest valid index (size - 1) can be written and read."""
        # The original used index k - 1 (= 4), which is not the last of
        # the 2 ** k registers.
        last = self.hll.size() - 1
        self.hll.set_register(last, 1)
        self.assertTrue(self.hll.registers()[last] == 1)

    def test_set_first_register(self):
        """Index 0 can be written and read back."""
        self.hll.set_register(0, 1)
        self.assertTrue(self.hll.registers()[0] == 1)

    def test_set_register_with_negative_value_fails(self):
        """A negative rank is rejected with ValueError."""
        with self.assertRaises(ValueError):
            self.hll.set_register(0, -1)

    def test_set_register_with_greater_than_max_rank_fails(self):
        """A rank of 33 is rejected with ValueError."""
        with self.assertRaises(ValueError):
            self.hll.set_register(0, 33)

    def test_set_register_with_index_out_of_bounds(self):
        """Index 32 (one past the last register) raises IndexError."""
        with self.assertRaises(IndexError):
            self.hll.set_register(32, 1)

    def test_set_register_with_negative_index_fails(self):
        """Expect ValueError from set_register(0, -1).

        NOTE(review): despite the name this passes index 0 and rank -1,
        duplicating the negative-value test. Confirm which exception a
        negative index raises before changing the arguments.
        """
        with self.assertRaises(ValueError):
            self.hll.set_register(0, -1)

    def test_bytesarray_has_correct_values(self):
        """Ranks written one by one are returned by registers()."""
        expected = bytearray(randint(0, 16) for x in range(32))
        for index in range(32):
            self.hll.set_register(index, expected[index])

        registers = self.hll.registers()
        self.assertEqual(expected, registers)

    def test_registers_returns_bytesarray(self):
        """registers() returns exactly a bytearray."""
        self.assertTrue(type(self.hll.registers()) is bytearray)

    def test_bytesarray_has_correct_length(self):
        """registers() has 2 ** k entries."""
        self.assertTrue(len(self.hll.registers()) == pow(2, self.k))

    def test_set_registers(self):
        """Ranks written in bulk are returned by registers()."""
        expected = bytearray(randint(0, 16) for x in range(32))
        self.hll.set_registers(expected)

        registers = self.hll.registers()
        self.assertEqual(expected, registers)
Esempio n. 35
0
from HLL import HyperLogLog
from HLL import get_SHA1_bin

# Exercise the SHA1 encoding helper.
# Every print below is written as a single-argument call so the script
# runs unchanged on Python 2 and Python 3 with the same output.
print('El codigo SHA1 de "hola" es ')
print(get_SHA1_bin('hola'))

a = HyperLogLog(2000000, 0.05)
b = HyperLogLog(2000000, 0.05)
c = HyperLogLog(2000000, 0.05)

# range() replaces the Python 2-only xrange(); the loops are small enough
# that materialising a list on Python 2 is harmless.
for i in range(100000):
    a.add(str(i))
for i in range(1500):
    b.add(str(i))
for i in range(100000, 200000):
    c.add(str(i))

# Joining with a single space reproduces what the Python 2 statement
# ``print label, value`` emitted.
print(" ".join([
    "1-100000 elementos aleatorios ingresados - Conteo estimado: ",
    str(a.getNumberEstimate()),
]))
print(" ".join([
    "1500 elementos aleatorios ingresados - Conteo estimado: ",
    str(b.getNumberEstimate()),
]))
print(" ".join([
    "100000-200000 elementos aleatorios ingresados - Conteo estimado: ",
    str(c.getNumberEstimate()),
]))
Esempio n. 36
0
 def test_maximum_size_is_valid(self):
     """Constructing HyperLogLog(16) must not raise."""
     constructed = True
     try:
         HyperLogLog(16)
     except Exception:
         constructed = False
     if not constructed:
         self.fail()
Esempio n. 37
0
 def setUp(self):
     """Create the HyperLogLog(5) fixture used by the tests."""
     fixture = HyperLogLog(5)
     self.hll = fixture
Esempio n. 38
0
 def test_greater_than_the_maximum_size_is_invalid(self):
     """Constructing HyperLogLog(17) must raise ValueError."""
     self.assertRaises(ValueError, HyperLogLog, 17)
Esempio n. 39
0
 def test_negative_size_is_invalid(self):
     """Constructing HyperLogLog(-1) must raise ValueError."""
     self.assertRaises(ValueError, HyperLogLog, -1)
Esempio n. 40
0
    def test_merge(self):
        """After merge() the target holds the registers set in both sketches."""
        receiver, donor = HyperLogLog(2), HyperLogLog(2)

        receiver.set_register(0, 1)
        donor.set_register(3, 1)
        receiver.merge(donor)

        # Registers 0 and 3 are 1, the two in between stay 0.
        wanted = bytearray(4)
        for position in (0, 3):
            wanted[position] = 1
        self.assertEqual(receiver.registers(), wanted)