def _hll_merge(v):
    """Union a collection of serialized HyperLogLog register arrays.

    Every element of ``v`` is loaded into a scratch sketch and merged into
    an accumulator sketch. Both sketches are sized by ``k``, a name that is
    not defined in this function and must be supplied by the enclosing scope.

    Returns the registers of the accumulator sketch.
    """
    merged = HyperLogLog(k)
    # One scratch sketch is reused for every element instead of allocating
    # a new sketch per element.
    scratch = HyperLogLog(k)
    for raw in v:
        scratch.set_registers(bytearray(raw))
        merged.merge(scratch)
    return merged.registers()
def test_seed_parameter_sets_seed(self):
    """The ``seed`` keyword is reported by ``seed()`` and affects hashing."""
    seeded_four = HyperLogLog(5, seed=4)
    self.assertEqual(seeded_four.seed(), 4)

    # Two sketches with different seeds must hash the same input differently.
    seeded_two = HyperLogLog(5, seed=2)
    hash_four = seeded_four.murmur3_hash('test')
    hash_two = seeded_two.murmur3_hash('test')
    self.assertNotEqual(hash_four, hash_two)
def _hll_merge(v: pd.DataFrame) -> bytes:
    """Merge the serialized sketches obtained by iterating ``v``.

    Each item is copied into a ``bytearray``, loaded into a reusable
    scratch sketch and folded into the result sketch. ``k`` (the sketch
    size parameter) is read from the enclosing scope.

    Returns the registers of the merged sketch.
    """
    union = HyperLogLog(k)
    loader = HyperLogLog(k)  # reused for every item
    for registers in v:
        loader.set_registers(bytearray(registers))
        union.merge(loader)
    return union.registers()
def _hll_init_agg(v: pd.DataFrame) -> bytes:
    """Aggregate a mix of raw values and serialized sketches into one sketch.

    Items obtained by iterating ``v`` are handled by type:

    * ``None`` is ignored;
    * ``bytes`` / ``bytearray`` items are treated as sketch registers and
      merged into the result;
    * anything else is converted with ``str`` and added as a single element.

    ``k`` (the sketch size parameter) is read from the enclosing scope.
    Returns the registers of the aggregated sketch.
    """
    aggregate = HyperLogLog(k)
    loader = HyperLogLog(k)  # scratch sketch for already-serialized items
    for item in v:
        if item is None:
            continue
        if isinstance(item, (bytes, bytearray)):
            loader.set_registers(bytearray(item))
            aggregate.merge(loader)
        else:
            aggregate.add(str(item))
    return aggregate.registers()
def test_merge(self):
    """Merging combines the registers that were set on either sketch."""
    left = HyperLogLog(2)
    right = HyperLogLog(2)
    left.set_register(0, 1)
    right.set_register(3, 1)

    left.merge(right)

    # Register 0 comes from ``left``, register 3 from ``right``.
    self.assertEqual(left.registers(), bytearray([1, 0, 0, 1]))
def getOutliers(self):
    """Return a dict of ``key -> sigmas`` for keys above the sigma threshold.

    The cardinality of every short-term sketch feeds a running ``Stdev``;
    only the ``self.topN`` largest counts are kept as outlier candidates.
    A key is reported when its count exceeds
    ``mean + self.sigmaCount * stdev``, and the reported value is the
    number of standard deviations above the mean.

    Keys returned by ``self.timePeriodMap.getActives()`` are re-checked
    against the same threshold even when they are not among the top N.

    All short-term sketches are replaced by fresh ones before returning.
    """
    outliers = {}
    stats = Stdev()
    candidates = []  # min-heap keeping the topN largest (count, key) pairs
    topN = self.topN

    for key, hll in self.shortCardDict.items():
        shortCount = hll.cardinality()
        stats.add(shortCount)
        if len(candidates) < topN:
            heapq.heappush(candidates, (shortCount, key))
        else:
            # Push the new pair and drop the smallest, so the heap keeps
            # the largest counts seen so far.
            heapq.heappushpop(candidates, (shortCount, key))

    mean = stats.getMean()
    stdv = stats.getStdev()
    threshold = mean + self.sigmaCount * stdv

    for cnt, key in candidates:
        if cnt > threshold:
            outliers[key] = (cnt - mean) / stdv

    for key in self.timePeriodMap.getActives():
        # Look up this key's own sketch. Previously the loop reused the
        # ``hll`` variable left over from the first loop, so every active
        # key was scored with the last sketch iterated (and the name was
        # undefined when shortCardDict was empty).
        # ``get`` avoids inserting an entry if shortCardDict is a
        # defaultdict; a key without a sketch has nothing to score.
        hll = self.shortCardDict.get(key)
        if hll is None:
            continue
        cnt = hll.cardinality()
        if cnt > threshold:
            outliers[key] = (cnt - mean) / stdv

    # empty short-term HLLs
    for key in self.shortCardDict.keys():
        self.shortCardDict[key] = HyperLogLog(16)

    return outliers
def _hll_count(v: pd.Series) -> pd.Series:
    """Estimate the cardinality of every serialized sketch in ``v``.

    A single scratch sketch (sized by ``k`` from the enclosing scope) is
    reloaded with each element's registers; the estimate is truncated to
    ``int``.
    """
    scratch = HyperLogLog(k)

    def estimate(raw):
        scratch.set_registers(bytearray(raw))
        return int(scratch.cardinality())

    return v.apply(estimate)
def setUp(self):
    """Build sketches of size 4..15 with random seeds and fill them.

    The n-th sketch (1-based) receives the strings ``'1'`` up to
    ``str(n ** 5 - 1)``.
    """
    sketches = [HyperLogLog(size, randint(1, 1000)) for size in range(4, 16)]

    for position, sketch in enumerate(sketches, start=1):
        upper = position ** 5
        for value in range(1, upper):
            sketch.add(str(value))

    self.hlls = sketches
def test_large_range_correction(self):
    """With all but the last register set to 16, the estimate lands in a
    fixed one-wide window."""
    hll = HyperLogLog(16)
    last_index = hll.size() - 1
    for index in range(last_index):
        hll.set_register(index, 16)

    estimate = hll.cardinality()

    self.assertTrue(7916284520 <= estimate <= 7916284521)
def _hll_init(v):
    """Map every value of ``v`` to the registers of a sketch holding only it.

    One sketch (sized by ``k`` from the enclosing scope) is reused: before
    each value it is reset to the registers captured right after
    construction. ``None`` values are not added, so they yield those
    initial registers.
    """
    scratch = HyperLogLog(k)
    initial_registers = scratch.registers()

    def registers_for(value):
        # Reset the shared sketch so earlier values do not leak in.
        scratch.set_registers(initial_registers)
        if value is not None:
            scratch.add(str(value))
        return scratch.registers()

    return v.apply(registers_for)
def __init__(self, error_rate=0.005):
    """Create a sketch sized for the requested ``error_rate``.

    The sizing follows ``error_rate = 1.04 / sqrt(m)`` where ``m = 2 ** p``
    is the number of registers, so ``p = ceil(log2((1.04 / error_rate) ** 2))``.
    """
    self.error_rate = error_rate

    # m: register count implied by the error rate.
    registers_needed = (1.04 / error_rate) ** 2
    # p: smallest exponent with 2 ** p >= m.
    p = int(math.ceil(math.log(registers_needed, 2)))

    self.hll = HyperLogLog(p)
def _stats_from_json(json: Dict[str, Any]) -> Dict[str, Any]: stats = {} if 'messages_sent' in json: stats['messages_sent'] = json['messages_sent'] if 'messages_received' in json: stats['messages_received'] = json['messages_received'] if 'users_active' in json: hll = HyperLogLog(12) hll.set_registers(bytearray(json['users_active'])) stats['users_active'] = hll return stats
def __init__(self, sigmaCount=5, period=86400, tolerance=0.001):
    """Initialise the detector state.

    ``sigmaCount`` and ``period`` are forwarded to the parent initialiser;
    ``tolerance`` is stored on the instance.
    """
    super().__init__(sigmaCount, period=period)

    # Configuration and counters.
    self.tolerance = tolerance
    self.totalCount = 0
    self.updatePeriod = 0

    # Per-key state; entries are created on first access.
    self.longCardDict = defaultdict(lambda: HyperLogLog(16))
    self.prevLongCard = {}
    # The original carried commented-out SlopeWindow / AverageWindow
    # alternatives for the next two factories.
    self.slopeDict = defaultdict(lambda: ILS())
    self.avgDict = defaultdict(lambda: Stdev())

    # Host bookkeeping. The original comment next to ``everFrozen``
    # mentions HyperLogLog(16) as an alternative to the exact set.
    self.frozenHosts = set()
    self.everFrozen = set()
def harmonic_centrality(G, max_distance=6):
    """Approximate harmonic centrality using HyperLogLog neighbourhood sketches.

    For every node a sketch is kept per distance ``d``. The sketch at
    ``d`` is the union of the node's own sketch at ``d - 1`` and the
    sketches at ``d - 1`` of all its neighbours (as returned by
    ``networkx.all_neighbors``). The estimated growth of the sketch between
    ``d - 1`` and ``d`` is divided by ``d`` and added to the node's score.

    Args:
        G: a NetworkX graph.
        max_distance: largest distance to expand to (inclusive).

    Returns:
        A plain dict mapping each node to its accumulated score. It is
        empty when ``max_distance`` is less than 1.
    """
    harmonic = defaultdict(int)
    t_steps_set = defaultdict(lambda: defaultdict(lambda: HyperLogLog(10)))

    # ``G.nodes()`` exists in both NetworkX 1.x and 2.x+, whereas
    # ``nodes_iter()`` was removed in 2.0. The nodes are materialised once
    # because they are walked once per distance.
    nodes = list(G.nodes())

    # Distance 0: each node reaches only itself.
    for node in nodes:
        t_steps_set[node][0].add(str(node))

    for distance in range(1, max_distance + 1):
        for node in nodes:
            reached = t_steps_set[node][distance]
            previous = t_steps_set[node][distance - 1]

            reached.merge(previous)
            for next_node in networkx.all_neighbors(G, node):
                reached.merge(t_steps_set[next_node][distance - 1])

            # Estimated growth of the sketch at this distance.
            newly_reached = reached.cardinality() - previous.cardinality()
            harmonic[node] += newly_reached / distance

    return dict(harmonic)
def getOutliers(self):
    """Return a dict of ``key -> sigmas`` for keys at or above the threshold.

    Each key's short-term cardinality is compared against that key's own
    history in ``self.stdevDict``: it is reported when the count is at
    least ``mean + stdev * self.sigmaCount``. The count is then added to
    the key's history, and all short-term sketches are replaced by fresh
    ones.
    """
    topN = self.topN  # read but not used in this method
    flagged = {}

    for key, sketch in self.shortCardDict.items():
        history = self.stdevDict[key]
        observed = sketch.cardinality()
        prevMean = history.getMean()
        prevStdev = history.getStdev()

        # NOTE(review): if getStdev() can return 0 this division raises
        # ZeroDivisionError - confirm against the Stdev implementation.
        if observed >= prevMean + prevStdev * self.sigmaCount:
            flagged[key] = (observed - prevMean) / prevStdev

        # Record the observation only after it has been scored against
        # the previous history.
        history.add(observed)

    # Start the next period with empty short-term sketches.
    for key in self.shortCardDict:
        self.shortCardDict[key] = HyperLogLog(16)

    return flagged
def _collect(self, stat: str, user: User, client: Client) -> None:
    """Record one occurrence of ``stat`` for the given client.

    Stats are kept in ``self.by_client``, keyed by the id returned from
    ``self._get_client_id(client)``. ``'users_active'`` is tracked as a
    ``HyperLogLog(12)`` sketch of ``user.email``; every other stat is an
    integer counter starting at 0.
    """
    assert user is not None
    assert client is not None

    if self.by_client is None:
        self.by_client = {}
    client_stats = self.by_client.setdefault(self._get_client_id(client), {})

    if stat != 'users_active':
        client_stats[stat] = client_stats.get(stat, 0) + 1
        return

    # Create the sketch lazily, on the first active user for this client.
    if stat not in client_stats:
        client_stats[stat] = HyperLogLog(12)
    client_stats[stat].add(user.email)
def getOutliersAll(self):
    """Return a dict of ``key -> sigmas`` for keys above the sigma threshold.

    The mean and standard deviation are computed over the cardinalities in
    ``self.shortCardDict``; the counts that are tested come from
    ``self.cardinalityDict``. All short-term sketches are replaced by fresh
    ones before returning.
    """
    baseline = Stdev()
    for sketch in self.shortCardDict.values():
        baseline.add(sketch.cardinality())

    mean = baseline.getMean()
    stdv = baseline.getStdev()
    threshold = mean + self.sigmaCount * stdv

    # NOTE(review): the baseline is built from shortCardDict but the
    # candidates are read from cardinalityDict - confirm this is intended.
    outliers = {}
    for key, sketch in self.cardinalityDict.items():
        cnt = sketch.cardinality()
        if cnt > threshold:
            outliers[key] = (cnt - mean) / stdv

    # Start the next period with empty short-term sketches.
    for key in self.shortCardDict:
        self.shortCardDict[key] = HyperLogLog(16)

    return outliers
def test_greater_than_the_maximum_size_is_invalid(self):
    """Constructing a sketch with k=17 raises ``ValueError``."""
    self.assertRaises(ValueError, HyperLogLog, 17)
from HLL import HyperLogLog
from generate_rand import gen

# Sketch with 2^5 registers.
hll = HyperLogLog(5)

# Echo every value produced by the generator.
for i in gen():
    print(i)

# Add one element, then read back the cardinality estimate.
hll.add('some data')
estimate = hll.cardinality()
def __init__(self, sigmaCount=5, period=600, topN=10):
    """Initialise the detector state.

    ``sigmaCount`` and ``period`` are forwarded to the parent initialiser;
    ``topN`` is stored on the instance.
    """
    super().__init__(sigmaCount, period=period)

    self.topN = topN
    self.totalCount = 0

    # One sketch for everything, plus one per key created on first access.
    self.totalCard = HyperLogLog(16)
    self.cardinalityDict = defaultdict(lambda: HyperLogLog(16))
def test_one_is_invalid_size(self):
    """Constructing a sketch with k=0 raises ``ValueError``."""
    self.assertRaises(ValueError, HyperLogLog, 0)
from HLL import HyperLogLog
from HLL import get_SHA1_bin


def _report(label, value):
    """Print ``label`` and ``value`` separated by one space.

    This reproduces the output of the Python 2 statement
    ``print label, value`` while also running on Python 3.
    """
    print("%s %s" % (label, value))


# Try out the SHA1 encoding.
print('El codigo SHA1 de "hola" es ')
print(get_SHA1_bin('hola'))

a = HyperLogLog(2000000, 0.05)
b = HyperLogLog(2000000, 0.05)
c = HyperLogLog(2000000, 0.05)

# ``range`` replaces ``xrange`` so the script runs on both Python 2 and 3.
for i in range(100000):
    a.add(str(i))

for i in range(1500):
    b.add(str(i))

for i in range(100000, 200000):
    c.add(str(i))

_report("1-100000 elementos aleatorios ingresados - Conteo estimado: ",
        a.getNumberEstimate())
_report("1500 elementos aleatorios ingresados - Conteo estimado: ",
        b.getNumberEstimate())
_report("100000-200000 elementos aleatorios ingresados - Conteo estimado: ",
        c.getNumberEstimate())
def test_negative_size_is_invalid(self):
    """Constructing a sketch with k=-1 raises ``ValueError``."""
    self.assertRaises(ValueError, HyperLogLog, -1)
def __init__(self, sigmaCount=5, period=3600, topN=10):
    """Initialise the detector state.

    ``sigmaCount`` and ``period`` are forwarded to the parent initialiser;
    ``topN`` is stored on the instance.
    """
    super().__init__(sigmaCount, period=period)

    self.topN = topN
    self.totalCount = 0

    # Per-key state; entries are created on first access.
    self.stdevDict = defaultdict(Stdev)
    self.shortCardDict = defaultdict(lambda: HyperLogLog(16))
def test_k_param_determines_the_number_of_registers(self):
    """A sketch built with k=5 exposes 32 registers."""
    sketch = HyperLogLog(5)
    register_count = len(sketch.registers())

    self.assertEqual(register_count, 32)
    self.assertEqual(sketch.size(), 32)
def test_all_registers_initialized_to_zero(self):
    """Every register of a freshly built sketch is 0."""
    for value in HyperLogLog(5).registers():
        self.assertEqual(value, 0)
def test_only_same_size_HyperLogLogs_can_be_merged(self):
    """Merging sketches built with different k raises ``ValueError``."""
    smaller = HyperLogLog(4)
    larger = HyperLogLog(5)

    self.assertRaises(ValueError, smaller.merge, larger)
def setUp(self):
    """Give every test a fresh sketch built with k=5."""
    fresh_sketch = HyperLogLog(5)
    self.hll = fresh_sketch
def test_maximum_size_is_valid(self):
    """Constructing a sketch with k=16 must not raise."""
    try:
        HyperLogLog(16)
    except Exception:
        # Report any exception as a plain test failure.
        self.fail()