def _hll_init_agg(v: pd.DataFrame) -> bytes:
    """Fold the items of ``v`` into a single HyperLogLog and return its registers.

    Items that are ``bytes``/``bytearray`` are loaded as the registers of a
    scratch sketch and merged into the result; ``None`` items are ignored;
    every other item is added to the result as ``str(item)``.

    Both sketches are built with ``k``, which this function reads from an
    enclosing scope.
    """
    merged = HyperLogLog(k)
    # Reused holder for incoming register blobs; overwritten on each use.
    scratch = HyperLogLog(k)

    for item in v:
        if item is None:
            continue
        if isinstance(item, (bytes, bytearray)):
            scratch.set_registers(bytearray(item))
            merged.merge(scratch)
        else:
            merged.add(str(item))

    return merged.registers()
class TestAdd(unittest.TestCase):
    """Checks that HyperLogLog.add() accepts each input type and reports changes."""

    def setUp(self):
        # Every test starts from its own freshly built HyperLogLog(5).
        self.hll = HyperLogLog(5)

    def _add_or_fail(self, make_item, failure_template):
        # make_item is a zero-argument callable so that building the value
        # happens inside the try block, exactly like the add() call itself.
        try:
            self.hll.add(make_item())
        except Exception as err:
            self.fail(failure_template % err)

    def test_add_string(self):
        self._add_or_fail(lambda: 'asdf', 'failed to add string: %s')

    @unittest.skipIf(sys.version_info[0] > 2, 'buffer is deprecated in python 3.x')
    def test_add_buffer(self):
        self._add_or_fail(lambda: buffer('asdf'), 'failed to add buffer: %s')

    def test_add_bytes(self):
        self._add_or_fail(lambda: b'asdf', 'failed to add bytes: %s')

    def test_cardinality_estimate_changed_return(self):
        # add() must return a truthy value for a first-time item, a falsy
        # value when the same item is repeated, and truthy again for a
        # different item.
        self.assertTrue(self.hll.add('asdf'))
        self.assertFalse(self.hll.add('asdf'))
        self.assertTrue(self.hll.add('otherdata'))
class ProbabilisticCounter(object):
    """Approximate distinct-value counter that delegates to a HyperLogLog.

    ``error_rate`` is expressed as a fraction: 1% = 0.01, 0.5% = 0.005
    (0.005 is the stated minimum).
    """

    def __init__(self, error_rate=0.005):
        self.error_rate = error_rate

        # error_rate = 1.04 / sqrt(m), where m = 2 ** p is the register count
        # (registers M(1)..M(m) start at 0). Solving for m and then for p:
        registers_needed = (1.04 / error_rate) ** 2
        precision = int(math.ceil(math.log(registers_needed, 2)))

        self.hll = HyperLogLog(precision)

    def add(self, value):
        """Add ``value``; returns True if it is new, False if already included."""
        was_new = self.hll.add(value)
        return was_new

    def count(self):
        """Return the sketch's cardinality estimate rounded down."""
        estimate = self.hll.cardinality()
        return math.floor(estimate)
class TestAdd(unittest.TestCase):
    """Checks that HyperLogLog.add() accepts each supported input type."""

    def setUp(self):
        # Every test starts from its own freshly built HyperLogLog(5).
        self.hll = HyperLogLog(5)

    def _add_or_fail(self, make_item, failure_template):
        # make_item is a zero-argument callable so that building the value
        # happens inside the try block, exactly like the add() call itself.
        try:
            self.hll.add(make_item())
        except Exception as err:
            self.fail(failure_template % err)

    def test_add_string(self):
        self._add_or_fail(lambda: 'asdf', 'failed to add string: %s')

    @unittest.skipIf(sys.version_info[0] > 2, 'buffer is deprecated in python 3.x')
    def test_add_buffer(self):
        self._add_or_fail(lambda: buffer('asdf'), 'failed to add buffer: %s')

    def test_add_bytes(self):
        self._add_or_fail(lambda: b'asdf', 'failed to add bytes: %s')
from HLL import HyperLogLog from HLL import get_SHA1_bin #probamos la codificacion de sha1 print 'El codigo SHA1 de "hola" es ' print get_SHA1_bin('hola') a = HyperLogLog(2000000, 0.05) b = HyperLogLog(2000000, 0.05) c = HyperLogLog(2000000, 0.05) for i in xrange(100000): a.add(str(i)) for i in xrange(1500): b.add(str(i)) for i in xrange(100000, 200000): c.add(str(i)) print "1-100000 elementos aleatorios ingresados - Conteo estimado: ", a.getNumberEstimate( ) print "1500 elementos aleatorios ingresados - Conteo estimado: ", b.getNumberEstimate( ) print "100000-200000 elementos aleatorios ingresados - Conteo estimado: ", c.getNumberEstimate( )
class IpPortScanDetector(NetflowDetector):
    """
    Flag source IP addresses that contact an unusually large number of
    distinct destinations (ip address + port combinations).

    One HyperLogLog is kept per source IP and every destination string seen
    for that source is added to it. (A further HyperLogLog and a plain
    counter track all destinations seen overall.) An IP is an outlier when
    its estimated cardinality is more than `sigmaCount` standard deviations
    above the mean cardinality across all tracked source IPs (e.g. 5 sigmas,
    to avoid most false alarms).
    """

    __slots__ = ('cardinalityDict', 'totalCard', 'totalCount', 'topN')

    def __init__(self, sigmaCount=5, period=600, topN=10):
        """
        sigmaCount: outlier threshold in standard deviations (passed to the base class)
        period: passed to the base class as `period`
        topN: how many of the highest-cardinality IPs getOutliers() examines
        """
        super().__init__(sigmaCount, period=period)
        self.topN = topN
        # Source IP string -> HyperLogLog of destinations; created on first use.
        self.cardinalityDict = defaultdict(lambda: HyperLogLog(16))
        self.totalCard = HyperLogLog(16)
        self.totalCount = 0

    def addNetflow(self, netflow):
        """ Record one netflow: its destination is added under its source IP. """
        srcip = netflow.getSourceIpString()
        dst = netflow.getDestinationString()
        self.cardinalityDict[srcip].add(dst)
        self.totalCard.add(dst)
        self.totalCount += 1

    def _cardinalityStats(self):
        """ Return ({key: cardinality}, mean, stdev) over every tracked HLL. """
        counts = {}
        s = Stdev()
        for key, hll in self.cardinalityDict.items():
            cnt = hll.cardinality()
            counts[key] = cnt
            s.add(cnt)
        return counts, s.getMean(), s.getStdev()

    def _sigmasIfOutlier(self, cnt, mean, stdv):
        """ Return how many sigmas cnt lies above the mean, or None if not an outlier. """
        # A zero spread gives nothing to measure against and would divide by zero.
        if stdv > 0 and cnt > mean + self.sigmaCount * stdv:
            return (cnt - mean) / stdv
        return None

    def getOutliersAll(self):
        """
        must return a dict of (key, sigmas if > sigmaCount)

        Examines every tracked source IP.
        """
        counts, mean, stdv = self._cardinalityStats()
        outliers = {}
        for key, cnt in counts.items():
            sigs = self._sigmasIfOutlier(cnt, mean, stdv)
            if sigs is not None:
                outliers[key] = sigs
        return outliers

    def logOutput(self, key, result):
        """
        key: an item being tracked
        result: bool -- True if starting above sigmaCount, False if ending above it.
        """
        dt = timestampToDatetime(self.lastTimestamp)
        if result:
            print(
                "%s ::: IP address %s became an outlier for IP port scanning."
                % (dt, key))
        else:
            print(
                "%s ::: IP address %s is no longer an outlier for IP port scanning."
                % (dt, key))

    def getOutliers(self):
        """
        must return a dict of (key, sigmas > sigmaCount)

        Only the topN highest-cardinality source IPs are examined, plus every
        key reported by self.timePeriodMap.getActives().
        """
        counts, mean, stdv = self._cardinalityStats()
        outliers = {}

        top = heapq.nlargest(self.topN, ((cnt, key) for key, cnt in counts.items()))
        for cnt, key in top:
            sigs = self._sigmasIfOutlier(cnt, mean, stdv)
            if sigs is not None:
                outliers[key] = sigs

        for key in self.timePeriodMap.getActives():
            # Use this key's own cardinality. Reading it from the counts
            # snapshot (not the defaultdict) avoids creating an empty HLL for
            # a key that is not tracked; such keys are skipped.
            cnt = counts.get(key)
            if cnt is None:
                continue
            sigs = self._sigmasIfOutlier(cnt, mean, stdv)
            if sigs is not None:
                outliers[key] = sigs
        return outliers

    def getCardinalities(self):
        """ Return the estimated cardinalities of all tracked HLLs, ascending. """
        return sorted(hll.cardinality() for hll in self.cardinalityDict.values())
from HLL import HyperLogLog
from generate_rand import gen

# Small usage script: build a HyperLogLog, add to it while iterating gen(),
# then read back the cardinality estimate.
hll = HyperLogLog(5) # use 2^5 registers

# NOTE(review): this script arrived collapsed onto one line; the loop body
# below (print + add) is the presumed layout -- confirm against the original.
for i in gen():
    print(i)
    # NOTE(review): the same literal is added on every pass rather than `i`,
    # so the estimate would not reflect the generated values -- confirm this
    # is intended.
    hll.add('some data')

# Estimated number of distinct items added to the sketch.
estimate = hll.cardinality()