def test_large_range_correction(self):
    """Check the estimate of a k=16 sketch whose registers (all but the last) hold 16."""
    sketch = HyperLogLog(16)
    # The final register is deliberately left out of the range, so it keeps
    # whatever value the constructor gave it.
    for index in range(sketch.size() - 1):
        sketch.set_register(index, 16)

    estimate = sketch.cardinality()
    self.assertTrue(7916284520 <= estimate <= 7916284521)
def __init__(self, error_rate=0.005):
    """Create the backing HyperLogLog sized for ``error_rate``.

    The sizing uses ``error_rate = 1.04 / sqrt(m)`` where ``m = 2 ** p`` is
    the number of registers (all initially zero). Solving for ``p`` and
    rounding up gives the argument passed to ``HyperLogLog``.
    """
    self.error_rate = error_rate
    registers_needed = (1.04 / error_rate) ** 2
    precision = int(math.ceil(math.log(registers_needed, 2)))
    self.hll = HyperLogLog(precision)
def _hll_init(v):
    """Map every element of ``v`` to the registers of a sketch fed with it.

    ``None`` elements are not added, so they map to the registers captured
    right after the sketch was created.
    """
    sketch = HyperLogLog(k)
    blank = sketch.registers()

    def registers_for(item):
        # One sketch is shared by all elements; its registers are restored to
        # the initial snapshot before each element is added.
        sketch.set_registers(blank)
        if item is not None:
            sketch.add(str(item))
        return sketch.registers()

    return v.apply(registers_for)
def _hll_merge(v):
    """Merge every register blob in ``v`` into one sketch and return its registers."""
    combined = HyperLogLog(k)
    # Scratch sketch that is reloaded with each blob before being merged in.
    scratch = HyperLogLog(k)
    for blob in v:
        scratch.set_registers(bytearray(blob))
        combined.merge(scratch)
    return combined.registers()
class TestAdd(unittest.TestCase):
    """Tests for ``HyperLogLog.add``: accepted input types and its return value."""

    def setUp(self):
        self.hll = HyperLogLog(5)

    def test_add_string(self):
        """Adding a ``str`` must not raise."""
        try:
            self.hll.add('asdf')
        except Exception as exc:
            self.fail('failed to add string: %s' % exc)

    @unittest.skipIf(sys.version_info[0] > 2,
                     'buffer is deprecated in python 3.x')
    def test_add_buffer(self):
        """Adding a ``buffer`` must not raise (only run on Python 2)."""
        try:
            self.hll.add(buffer('asdf'))
        except Exception as exc:
            self.fail('failed to add buffer: %s' % exc)

    def test_add_bytes(self):
        """Adding ``bytes`` must not raise."""
        try:
            self.hll.add(b'asdf')
        except Exception as exc:
            self.fail('failed to add bytes: %s' % exc)

    def test_cardinality_estimate_changed_return(self):
        """``add`` is truthy for a first sighting and falsy for a repeat."""
        first_time = self.hll.add('asdf')
        self.assertTrue(first_time)

        repeated = self.hll.add('asdf')
        self.assertFalse(repeated)

        another_value = self.hll.add('otherdata')
        self.assertTrue(another_value)
def _hll_merge(v: pd.DataFrame) -> bytes:
    """Merge every register blob yielded by ``v`` and return the merged registers.

    NOTE(review): iterating a pandas DataFrame yields its column labels, not
    its rows; confirm that the object passed at runtime iterates over the
    register blobs (e.g. a Series).
    """
    combined = HyperLogLog(k)
    # Scratch sketch that is reloaded with each blob before being merged in.
    scratch = HyperLogLog(k)
    for blob in v:
        scratch.set_registers(bytearray(blob))
        combined.merge(scratch)
    return combined.registers()
def _stats_from_json(json: Dict[str, Any]) -> Dict[str, Any]:
    """Rebuild a stats dict from its JSON representation.

    ``messages_sent`` and ``messages_received`` are copied unchanged when
    present. ``users_active``, when present, is loaded into a
    ``HyperLogLog(12)`` via ``set_registers``. Any other key is ignored.
    """
    stats = {
        name: json[name]
        for name in ('messages_sent', 'messages_received')
        if name in json
    }
    if 'users_active' in json:
        active_users = HyperLogLog(12)
        active_users.set_registers(bytearray(json['users_active']))
        stats['users_active'] = active_users
    return stats
def getOutliers(self):
    """Score the busiest keys and the active keys against this period's counts.

    The mean and standard deviation are taken over the short-term
    cardinality of every key. The ``topN`` largest keys, plus every key
    reported by ``self.timePeriodMap.getActives()``, are then compared with
    ``mean + sigmaCount * stdev``.

    Returns:
        dict mapping key -> number of standard deviations above the mean,
        for the keys that exceed the threshold. When the standard deviation
        is zero the score is ``float('inf')`` instead of dividing by zero.

    Side effect: every short-term HyperLogLog is replaced by an empty
    ``HyperLogLog(16)``.
    """
    outliers = {}
    stats = Stdev()
    top = []  # min-heap that ends up holding the topN largest (count, key)
    topN = self.topN
    for key, hll in self.shortCardDict.items():
        shortCount = hll.cardinality()
        stats.add(shortCount)
        if len(top) < topN:
            heapq.heappush(top, (shortCount, key))
        else:
            heapq.heappushpop(top, (shortCount, key))

    mean = stats.getMean()
    stdv = stats.getStdev()
    threshold = mean + self.sigmaCount * stdv

    candidates = list(top)
    for key in self.timePeriodMap.getActives():
        # NOTE(review): the previous code read `hll` left over from the loop
        # above here, so every active key was scored with the count of the
        # last key iterated. The key's own short-term sketch is assumed to be
        # what was intended -- confirm. `.get` avoids creating entries when
        # shortCardDict is a defaultdict.
        hll = self.shortCardDict.get(key)
        if hll is not None:
            candidates.append((hll.cardinality(), key))

    for cnt, key in candidates:
        if cnt > threshold:
            outliers[key] = (cnt - mean) / stdv if stdv else float('inf')

    # empty short-term HLLs
    for key in list(self.shortCardDict):
        self.shortCardDict[key] = HyperLogLog(16)
    return outliers
def test_merge(self):
    """After a merge the target holds the registers set on either sketch."""
    target = HyperLogLog(2)
    other = HyperLogLog(2)
    target.set_register(0, 1)
    other.set_register(3, 1)

    target.merge(other)

    self.assertEqual(target.registers(), bytearray([1, 0, 0, 1]))
def _hll_count(v: pd.Series) -> pd.Series:
    """Return the integer cardinality estimate for each register blob in ``v``."""
    sketch = HyperLogLog(k)

    def estimate(blob):
        # The one sketch is reloaded for each element rather than rebuilt.
        sketch.set_registers(bytearray(blob))
        return int(sketch.cardinality())

    return v.apply(estimate)
def setUp(self):
    """Build sketches for sizes 4..15 and fill each with a run of numbers.

    The i-th sketch (i starting at 1) receives the strings ``'1'`` up to
    ``str(i ** 5 - 1)``. The second constructor argument is a random integer
    in [1, 1000].
    """
    sketches = []
    for size in range(4, 16):
        sketches.append(HyperLogLog(size, randint(1, 1000)))

    # zip stops at the shorter input, so only bases 1..12 are used.
    for sketch, base in zip(sketches, range(1, 16)):
        for number in range(1, base ** 5):
            sketch.add(str(number))

    self.hlls = sketches
def test_seed_parameter_sets_seed(self):
    """The ``seed`` keyword is reported by ``seed()`` and changes the hash."""
    seeded_four = HyperLogLog(5, seed=4)
    self.assertEqual(seeded_four.seed(), 4)

    seeded_two = HyperLogLog(5, seed=2)
    hash_with_four = seeded_four.murmur3_hash('test')
    hash_with_two = seeded_two.murmur3_hash('test')
    self.assertNotEqual(hash_with_four, hash_with_two)
class TestCardinalityEstimation(unittest.TestCase):
    """Cardinality estimates for sketches whose registers are set by hand."""

    def setUp(self):
        self.hll = HyperLogLog(5)

    def test_small_range_correction_all_registers_set_to_zero(self):
        """An untouched sketch estimates zero."""
        self.assertEqual(self.hll.cardinality(), 0.0)

    def test_small_range_correction_not_all_registers_set_to_zero(self):
        """A single register at rank 1 gives roughly 1.4657."""
        self.hll.set_register(0, 1)
        estimate = self.hll.cardinality()
        self.assertTrue(1.46571806761 <= estimate <= 1.46571806762)

    def test_medium_range_no_correction(self):
        """All 32 registers at rank 2 give roughly 89.216."""
        for index in range(32):
            self.hll.set_register(index, 2)
        estimate = self.hll.cardinality()
        self.assertTrue(89.216 <= estimate <= 89.217)

    @unittest.skip("correction value needs to be re-computed")
    def test_large_range_correction(self):
        """k=16 sketch with every register but the last at rank 16 (skipped)."""
        sketch = HyperLogLog(16)
        for index in range(sketch.size() - 1):
            sketch.set_register(index, 16)
        estimate = sketch.cardinality()
        self.assertTrue(7916284520 <= estimate <= 7916284521)
class TestCardinalityEstimation(unittest.TestCase):
    """Checks ``cardinality()`` against expected ranges for fixed registers."""

    def setUp(self):
        self.hll = HyperLogLog(5)

    def test_small_range_correction_all_registers_set_to_zero(self):
        """With no register set the estimate is exactly 0.0."""
        self.assertEqual(self.hll.cardinality(), 0.0)

    def test_small_range_correction_not_all_registers_set_to_zero(self):
        """One register at rank 1 falls inside the expected narrow range."""
        lower, upper = 1.46571806761, 1.46571806762
        self.hll.set_register(0, 1)
        value = self.hll.cardinality()
        self.assertTrue(lower <= value <= upper)

    def test_medium_range_no_correction(self):
        """Registers 0..31 at rank 2 fall inside the expected range."""
        lower, upper = 89.216, 89.217
        for position in range(32):
            self.hll.set_register(position, 2)
        value = self.hll.cardinality()
        self.assertTrue(lower <= value <= upper)

    @unittest.skip("correction value needs to be re-computed")
    def test_large_range_correction(self):
        """k=16 sketch, all registers except the last at rank 16 (skipped)."""
        lower, upper = 7916284520, 7916284521
        big = HyperLogLog(16)
        for position in range(big.size() - 1):
            big.set_register(position, 16)
        value = big.cardinality()
        self.assertTrue(lower <= value <= upper)
class ProbabilisticCounter(object):
    """Approximate distinct-value counter backed by a HyperLogLog.

    error_rate: 1% = 0.01, 0.5% = 0.005 (min 0.005)
    """

    def __init__(self, error_rate=0.005):
        """Size the sketch from ``error_rate``.

        Uses ``error_rate = 1.04 / sqrt(m)`` with ``m = 2 ** p`` registers
        (all initially zero); ``p`` is rounded up to the next integer.
        """
        self.error_rate = error_rate
        registers_needed = (1.04 / error_rate) ** 2
        precision = int(math.ceil(math.log(registers_needed, 2)))
        self.hll = HyperLogLog(precision)

    def add(self, value):
        """Add ``value``; returns True if it is new, False if already included."""
        is_new = self.hll.add(value)
        return is_new

    def count(self):
        """Return the sketch's cardinality estimate rounded down."""
        estimate = self.hll.cardinality()
        return math.floor(estimate)
def __init__(self, sigmaCount=5, period=86400, tolerance=0.001):
    """Initialise the per-key long-term state.

    ``sigmaCount`` and ``period`` are forwarded to the parent initialiser;
    ``tolerance`` is stored on the instance.
    """
    super().__init__(sigmaCount, period=period)
    self.tolerance = tolerance

    # Counters.
    self.totalCount = 0
    self.updatePeriod = 0

    # Per-key state, created lazily on first access.
    self.longCardDict = defaultdict(lambda: HyperLogLog(16))
    self.prevLongCard = {}
    self.slopeDict = defaultdict(ILS)  # alternative kept from before: SlopeWindow()
    self.avgDict = defaultdict(Stdev)  # alternative kept from before: AverageWindow()

    self.frozenHosts = set()
    self.everFrozen = set()  # alternative kept from before: HyperLogLog(16)
def harmonic_centrality(G, max_distance=6):
    """Approximate the harmonic centrality of every node with HyperLogLogs.

    For each node one sketch per distance ``d`` (0..max_distance) is kept.
    The sketch for ``d`` is the node's own sketch for ``d - 1`` merged with
    the ``d - 1`` sketches of everything ``networkx.all_neighbors`` returns
    for it. The growth in estimated cardinality from ``d - 1`` to ``d``,
    divided by ``d``, is added to the node's score.

    Args:
        G: a networkx graph.
        max_distance: largest distance taken into account.

    Returns:
        dict mapping node -> approximate harmonic centrality. It is empty
        when ``max_distance`` is less than 1.
    """
    harmonic = defaultdict(lambda: 0)
    reach = defaultdict(lambda: defaultdict(lambda: HyperLogLog(10)))

    # Iterating the graph yields its nodes on every networkx release;
    # Graph.nodes_iter() only exists in networkx 1.x.
    for node in G:
        reach[node][0].add(str(node))

    for distance in range(1, max_distance + 1):
        for node in G:
            current = reach[node][distance]
            previous = reach[node][distance - 1]
            current.merge(previous)
            for next_node in networkx.all_neighbors(G, node):
                current.merge(reach[next_node][distance - 1])
            newly_reached = current.cardinality() - previous.cardinality()
            harmonic[node] += newly_reached / distance

    return dict(harmonic)
def _collect(self, stat: str, user: User, client: Client) -> None:
    """Record one occurrence of ``stat`` under the client's id.

    ``users_active`` is tracked as a ``HyperLogLog(12)`` fed with
    ``user.email``; every other stat is a plain integer counter.
    """
    assert user is not None
    assert client is not None

    if self.by_client is None:
        self.by_client = {}
    per_client = self.by_client.setdefault(self._get_client_id(client), {})

    if stat != 'users_active':
        per_client[stat] = per_client.get(stat, 0) + 1
        return

    # Created on first use only, so no sketch is built for existing entries.
    if stat not in per_client:
        per_client[stat] = HyperLogLog(12)
    per_client[stat].add(user.email)
def getOutliers(self):
    """Score each key's short-term cardinality against its own history.

    A key is an outlier when its count is at least
    ``prevMean + prevStdev * sigmaCount``, with the mean and deviation taken
    from ``self.stdevDict[key]`` before the current count is added to it.

    Returns:
        dict mapping key -> number of standard deviations above the key's
        previous mean. With a zero previous deviation, a count above the
        mean scores ``float('inf')`` and a count equal to the mean is not
        reported; previously both cases divided by zero.

    Side effects: the current count is added to each key's history, and
    every short-term HyperLogLog is replaced by an empty ``HyperLogLog(16)``.
    """
    outliers = {}
    for key, hll in self.shortCardDict.items():
        shortCount = hll.cardinality()
        history = self.stdevDict[key]
        prevMean = history.getMean()
        prevStdev = history.getStdev()
        if shortCount >= prevMean + prevStdev * self.sigmaCount:
            excess = shortCount - prevMean
            if prevStdev:
                outliers[key] = excess / prevStdev
            elif excess > 0:
                outliers[key] = float('inf')
        # update stdevDict, while we've got the information to do so.
        history.add(shortCount)

    # empty short-term HLLs
    for key in list(self.shortCardDict):
        self.shortCardDict[key] = HyperLogLog(16)
    return outliers
def getOutliersAll(self):
    """Score every key in ``cardinalityDict`` against the short-term counts.

    The mean and standard deviation come from the cardinalities in
    ``self.shortCardDict``; the values compared against
    ``mean + sigmaCount * stdev`` come from ``self.cardinalityDict``.

    Returns:
        dict mapping key -> number of standard deviations above the mean,
        for the keys that exceed the threshold. When the standard deviation
        is zero the score is ``float('inf')`` instead of dividing by zero.

    Side effect: every short-term HyperLogLog is replaced by an empty
    ``HyperLogLog(16)``.
    """
    outliers = {}
    stats = Stdev()
    for hll in self.shortCardDict.values():
        stats.add(hll.cardinality())

    mean = stats.getMean()
    stdv = stats.getStdev()
    threshold = mean + self.sigmaCount * stdv

    for key, hll in self.cardinalityDict.items():
        cnt = hll.cardinality()
        if cnt > threshold:
            outliers[key] = (cnt - mean) / stdv if stdv else float('inf')

    # empty short-term HLLs
    for key in list(self.shortCardDict):
        self.shortCardDict[key] = HyperLogLog(16)
    return outliers
class TestAdd(unittest.TestCase):
    """Tests that ``HyperLogLog.add`` accepts str, buffer and bytes input."""

    def setUp(self):
        self.hll = HyperLogLog(5)

    def test_add_string(self):
        """Adding a ``str`` must not raise."""
        try:
            self.hll.add('asdf')
        except Exception as exc:
            self.fail('failed to add string: %s' % exc)

    @unittest.skipIf(sys.version_info[0] > 2,
                     'buffer is deprecated in python 3.x')
    def test_add_buffer(self):
        """Adding a ``buffer`` must not raise (only run on Python 2)."""
        try:
            self.hll.add(buffer('asdf'))
        except Exception as exc:
            self.fail('failed to add buffer: %s' % exc)

    def test_add_bytes(self):
        """Adding ``bytes`` must not raise."""
        try:
            self.hll.add(b'asdf')
        except Exception as exc:
            self.fail('failed to add bytes: %s' % exc)
def __init__(self, sigmaCount=5, period=3600, topN=10):
    """Initialise the short-term per-key state.

    ``sigmaCount`` and ``period`` are forwarded to the parent initialiser;
    ``topN`` is stored on the instance.
    """
    super().__init__(sigmaCount, period=period)
    self.totalCount = 0
    self.topN = topN
    # Both maps create their value lazily the first time a key is read.
    self.stdevDict = defaultdict(Stdev)
    self.shortCardDict = defaultdict(lambda: HyperLogLog(16))
def test_all_registers_initialized_to_zero(self):
    """Every register of a freshly built sketch is 0."""
    fresh = HyperLogLog(5)
    for value in fresh.registers():
        self.assertEqual(value, 0)
def test_k_param_determines_the_number_of_registers(self):
    """A sketch built with 5 exposes 32 registers and reports size 32."""
    sketch = HyperLogLog(5)
    register_count = len(sketch.registers())
    self.assertEqual(register_count, 32)
    self.assertEqual(sketch.size(), 32)
def test_only_same_size_HyperLogLogs_can_be_merged(self):
    """Merging sketches built with different sizes raises ValueError."""
    smaller = HyperLogLog(4)
    larger = HyperLogLog(5)
    self.assertRaises(ValueError, smaller.merge, larger)
def setUp(self):
    """Give every test its own sketch built with size parameter 5."""
    size_parameter = 5
    self.hll = HyperLogLog(size_parameter)
class TestRegisterFunctions(unittest.TestCase):
    """Tests for ``set_register`` and ``registers`` on a k=5 sketch."""

    def setUp(self):
        self.k = 5
        self.hll = HyperLogLog(self.k)

    def test_set_last_register(self):
        """The register at the highest valid index can be set."""
        # The last index is size() - 1; k - 1 would only reach register 4.
        last = self.hll.size() - 1
        self.hll.set_register(last, 1)
        self.assertEqual(self.hll.registers()[last], 1)

    def test_set_first_register(self):
        """The register at index 0 can be set."""
        self.hll.set_register(0, 1)
        self.assertEqual(self.hll.registers()[0], 1)

    def test_set_register_with_negative_value_fails(self):
        """A negative rank raises ValueError."""
        with self.assertRaises(ValueError):
            self.hll.set_register(0, -1)

    def test_set_register_with_greater_than_max_rank_fails(self):
        """A rank of 33 raises ValueError."""
        with self.assertRaises(ValueError):
            self.hll.set_register(0, 33)

    def test_set_register_with_index_out_of_bounds(self):
        """Index 32, one past the last register, raises IndexError."""
        with self.assertRaises(IndexError):
            self.hll.set_register(32, 1)

    def test_set_register_with_negative_index_fails(self):
        # NOTE(review): these arguments are the same as in the negative-value
        # test (index 0, rank -1), so no negative index is exercised here.
        # Presumably set_register(-1, 1) was intended -- confirm which
        # exception the library raises for it before changing the call.
        with self.assertRaises(ValueError):
            self.hll.set_register(0, -1)

    def test_bytesarray_has_correct_values(self):
        """Values written with set_register are read back from registers()."""
        size = self.hll.size()
        expected = bytearray(randint(0, 16) for _ in range(size))
        # Cover every register, including the last one.
        for index, rank in enumerate(expected):
            self.hll.set_register(index, rank)

        self.assertEqual(expected, self.hll.registers())

    def test_registers_returns_bytesarray(self):
        """registers() returns exactly a bytearray."""
        self.assertIs(type(self.hll.registers()), bytearray)

    def test_bytesarray_has_correct_length(self):
        """registers() has 2 ** k entries."""
        self.assertEqual(len(self.hll.registers()), pow(2, self.k))
def test_one_is_invalid_size(self):
    """Constructing a sketch with size 0 raises ValueError."""
    # NOTE(review): the test name says "one" but the value passed is 0;
    # confirm which of the two was meant.
    self.assertRaises(ValueError, HyperLogLog, 0)
class TestRegisterFunctions(unittest.TestCase):
    """Tests for ``set_register``, ``set_registers`` and ``registers`` (k=5)."""

    def setUp(self):
        self.k = 5
        self.hll = HyperLogLog(self.k)

    def test_set_last_register(self):
        """The register at the highest valid index can be set."""
        # The last index is size() - 1; k - 1 would only reach register 4.
        last = self.hll.size() - 1
        self.hll.set_register(last, 1)
        self.assertEqual(self.hll.registers()[last], 1)

    def test_set_first_register(self):
        """The register at index 0 can be set."""
        self.hll.set_register(0, 1)
        self.assertEqual(self.hll.registers()[0], 1)

    def test_set_register_with_negative_value_fails(self):
        """A negative rank raises ValueError."""
        with self.assertRaises(ValueError):
            self.hll.set_register(0, -1)

    def test_set_register_with_greater_than_max_rank_fails(self):
        """A rank of 33 raises ValueError."""
        with self.assertRaises(ValueError):
            self.hll.set_register(0, 33)

    def test_set_register_with_index_out_of_bounds(self):
        """Index 32, one past the last register, raises IndexError."""
        with self.assertRaises(IndexError):
            self.hll.set_register(32, 1)

    def test_set_register_with_negative_index_fails(self):
        # NOTE(review): these arguments are the same as in the negative-value
        # test (index 0, rank -1), so no negative index is exercised here.
        # Presumably set_register(-1, 1) was intended -- confirm which
        # exception the library raises for it before changing the call.
        with self.assertRaises(ValueError):
            self.hll.set_register(0, -1)

    def test_bytesarray_has_correct_values(self):
        """Values written one by one are read back from registers()."""
        expected = bytearray(randint(0, 16) for _ in range(32))
        for index, rank in enumerate(expected):
            self.hll.set_register(index, rank)

        self.assertEqual(expected, self.hll.registers())

    def test_registers_returns_bytesarray(self):
        """registers() returns exactly a bytearray."""
        self.assertIs(type(self.hll.registers()), bytearray)

    def test_bytesarray_has_correct_length(self):
        """registers() has 2 ** k entries."""
        self.assertEqual(len(self.hll.registers()), pow(2, self.k))

    def test_set_registers(self):
        """Values written in bulk are read back from registers()."""
        expected = bytearray(randint(0, 16) for _ in range(32))
        self.hll.set_registers(expected)

        self.assertEqual(expected, self.hll.registers())
from HLL import HyperLogLog
from HLL import get_SHA1_bin

# Demo script: prints a SHA1 encoding, then fills three sketches with runs of
# consecutive integers (as strings) and prints each estimated count.
#
# Every print call receives one pre-built string, so the statement behaves the
# same under the Python 2 print statement and the Python 3 print function.

# Try out the SHA1 encoding.
print('El codigo SHA1 de "hola" es ')
print(get_SHA1_bin('hola'))

a = HyperLogLog(2000000, 0.05)
b = HyperLogLog(2000000, 0.05)
c = HyperLogLog(2000000, 0.05)

# range() replaces the Python 2-only xrange().
for i in range(100000):
    a.add(str(i))
for i in range(1500):
    b.add(str(i))
for i in range(100000, 200000):
    c.add(str(i))

# Joining with a single space reproduces what `print label, value` emitted.
print(" ".join([
    "1-100000 elementos aleatorios ingresados - Conteo estimado: ",
    str(a.getNumberEstimate()),
]))
print(" ".join([
    "1500 elementos aleatorios ingresados - Conteo estimado: ",
    str(b.getNumberEstimate()),
]))
print(" ".join([
    "100000-200000 elementos aleatorios ingresados - Conteo estimado: ",
    str(c.getNumberEstimate()),
]))
def test_maximum_size_is_valid(self):
    """Constructing a sketch with size 16 must not raise."""
    largest_size = 16
    try:
        HyperLogLog(largest_size)
    except Exception:
        self.fail()
def test_greater_than_the_maximum_size_is_invalid(self):
    """Constructing a sketch with size 17 raises ValueError."""
    self.assertRaises(ValueError, HyperLogLog, 17)
def test_negative_size_is_invalid(self):
    """Constructing a sketch with size -1 raises ValueError."""
    self.assertRaises(ValueError, HyperLogLog, -1)