Esempio n. 1
0
 def _hll_init_agg(v: pd.DataFrame) -> bytes:
     """Fold every entry of ``v`` into one HyperLogLog and return its registers.

     Entries that are ``bytes``/``bytearray`` are loaded into a scratch sketch
     with ``set_registers`` and merged into the result. ``None`` entries are
     skipped. Anything else is added by its ``str()`` form.

     NOTE(review): iterating a ``pd.DataFrame`` yields its column labels, so
     the annotation is presumably meant to be a Series of values -- confirm
     against the caller.
     """
     # ``k`` is not defined in this function; it is resolved from an outer scope.
     merged = HyperLogLog(k)
     scratch = HyperLogLog(k)  # reused for each incoming register buffer
     for item in v:
         if item is None:
             continue
         if not isinstance(item, (bytes, bytearray)):
             merged.add(str(item))
             continue
         scratch.set_registers(bytearray(item))
         merged.merge(scratch)
     return merged.registers()
Esempio n. 2
0
class TestAdd(unittest.TestCase):
    """Exercises ``HyperLogLog.add`` with several input types and checks its
    return value."""

    def setUp(self):
        # Every test starts from a fresh sketch built with HyperLogLog(5).
        self.hll = HyperLogLog(5)

    def test_add_string(self):
        # Adding a str must not raise.
        try:
            self.hll.add('asdf')
        except Exception as err:
            self.fail('failed to add string: %s' % err)

    @unittest.skipIf(sys.version_info >= (3,),
                     'buffer is deprecated in python 3.x')
    def test_add_buffer(self):
        # Only runs on Python 2, where the ``buffer`` builtin exists.
        try:
            self.hll.add(buffer('asdf'))
        except Exception as err:
            self.fail('failed to add buffer: %s' % err)

    def test_add_bytes(self):
        # Adding a bytes literal must not raise.
        try:
            self.hll.add(b'asdf')
        except Exception as err:
            self.fail('failed to add bytes: %s' % err)

    def test_cardinality_estimate_changed_return(self):
        # add() is expected to return True for the first 'asdf', False for
        # the repeated 'asdf', and True again for the new value 'otherdata'.
        self.assertTrue(self.hll.add('asdf'))
        self.assertFalse(self.hll.add('asdf'))
        self.assertTrue(self.hll.add('otherdata'))
Esempio n. 3
0
class ProbabilisticCounter(object):
    """Approximate counter that delegates to a HyperLogLog sketch.

    ``error_rate`` is a fraction: 1% = 0.01, 0.5% = 0.005 (min 0.005).
    """

    def __init__(self, error_rate=0.005):
        """Size the sketch from ``error_rate`` and create it.

        Sizing rule (from the original notes):
            error_rate = 1.04 / sqrt(m)
            m = 2 ** p  -> registers count
            M(1)... M(m) = 0 -> registers
        so ``p`` is the base-2 logarithm of ``(1.04 / error_rate) ** 2``,
        rounded up to an integer.
        """
        self.error_rate = error_rate

        registers_needed = (1.04 / error_rate) ** 2
        precision = int(math.ceil(math.log(registers_needed, 2)))
        self.hll = HyperLogLog(precision)

    def add(self, value):
        """Add ``value`` to the sketch and return the result of ``hll.add``.

        Per the original note: True - value is new, False - value already
        included.
        """
        outcome = self.hll.add(value)
        return outcome

    def count(self):
        """Return the sketch's cardinality estimate, rounded down."""
        estimate = self.hll.cardinality()
        return math.floor(estimate)
Esempio n. 4
0
class TestAdd(unittest.TestCase):
    """Checks that ``HyperLogLog.add`` accepts str, buffer (Python 2 only)
    and bytes values without raising."""

    def setUp(self):
        # A new sketch built with HyperLogLog(5) for each test method.
        self.hll = HyperLogLog(5)

    def test_add_string(self):
        # Any exception raised by add() is turned into a test failure.
        try:
            self.hll.add('asdf')
        except Exception as exc:
            self.fail('failed to add string: %s' % exc)

    @unittest.skipIf(sys.version_info >= (3,),
                     'buffer is deprecated in python 3.x')
    def test_add_buffer(self):
        # ``buffer`` is a Python 2 builtin, hence the skip on Python 3.
        try:
            self.hll.add(buffer('asdf'))
        except Exception as exc:
            self.fail('failed to add buffer: %s' % exc)

    def test_add_bytes(self):
        # Same check with a bytes literal.
        try:
            self.hll.add(b'asdf')
        except Exception as exc:
            self.fail('failed to add bytes: %s' % exc)
Esempio n. 5
0
class TestAdd(unittest.TestCase):
    """Smoke tests for ``HyperLogLog.add`` across supported input types."""

    def setUp(self):
        # Each test method gets its own HyperLogLog(5) instance.
        self.hll = HyperLogLog(5)

    def test_add_string(self):
        # A plain string must be accepted.
        try:
            self.hll.add('asdf')
        except Exception as error:
            self.fail('failed to add string: %s' % error)

    @unittest.skipIf(
        sys.version_info >= (3,),
        'buffer is deprecated in python 3.x',
    )
    def test_add_buffer(self):
        # Skipped on Python 3, where the ``buffer`` builtin no longer exists.
        try:
            self.hll.add(buffer('asdf'))
        except Exception as error:
            self.fail('failed to add buffer: %s' % error)

    def test_add_bytes(self):
        # A bytes literal must be accepted.
        try:
            self.hll.add(b'asdf')
        except Exception as error:
            self.fail('failed to add bytes: %s' % error)
Esempio n. 6
0
from HLL import HyperLogLog
from HLL import get_SHA1_bin

# Try out the SHA1 encoding.
# Every print below uses the single-argument, parenthesised form so that the
# script parses and prints the same text on both Python 2 and Python 3.
print('El codigo SHA1 de "hola" es ')
print(get_SHA1_bin('hola'))

# Three independent sketches created with the same constructor arguments.
a = HyperLogLog(2000000, 0.05)
b = HyperLogLog(2000000, 0.05)
c = HyperLogLog(2000000, 0.05)

# range() replaces the Python 2-only xrange(); the items added are unchanged:
# the decimal strings for 0..99999, 0..1499 and 100000..199999 respectively.
for i in range(100000):
    a.add(str(i))
for i in range(1500):
    b.add(str(i))
for i in range(100000, 200000):
    c.add(str(i))

# "%s %s" reproduces the single space the Python 2 print statement inserted
# between its two comma-separated items, so the output text is unchanged.
print("%s %s" % (
    "1-100000 elementos aleatorios ingresados - Conteo estimado: ",
    a.getNumberEstimate()))
print("%s %s" % (
    "1500 elementos aleatorios ingresados - Conteo estimado: ",
    b.getNumberEstimate()))
print("%s %s" % (
    "100000-200000 elementos aleatorios ingresados - Conteo estimado: ",
    c.getNumberEstimate()))
Esempio n. 7
0
class IpPortScanDetector(NetflowDetector):
    """ We want to find an IP address that's scanning a large number of
        ip address + port combinations.

        (For example, at the 5-sigma level, if we want to avoid most false alarms.)

        To do this, we create an HLL for each source IP address, and add the
        netflow's destination string to that HLL. (We also keep one HLL for
        the total number of destinations we see.) We regularly check the
        cardinality for each IP address, and issue a warning when it's more
        than sigmaCount sigmas above the mean over all the per-IP HLLs.
    """
    __slots__ = ('cardinalityDict', 'totalCard', 'totalCount', 'topN')

    def __init__(self, sigmaCount=5, period=600, topN=10):
        """ sigmaCount, period: forwarded to the NetflowDetector constructor.
            topN: how many of the largest cardinalities getOutliers() examines.
        """
        super().__init__(sigmaCount, period=period)
        self.topN = topN
        # One sketch per source IP, created lazily on first access.
        self.cardinalityDict = defaultdict(lambda: HyperLogLog(16))
        # One sketch over every destination seen, regardless of source.
        self.totalCard = HyperLogLog(16)
        self.totalCount = 0

    def addNetflow(self, netflow):
        """ Record one netflow: add its destination string to the source IP's
            sketch and to the global sketch, and bump the flow counter.
        """
        srcip = netflow.getSourceIpString()
        dst = netflow.getDestinationString()
        self.cardinalityDict[srcip].add(dst)
        self.totalCard.add(dst)
        self.totalCount += 1

    def _sigmasIfOutlier(self, cnt, mean, stdv):
        """ Return how many standard deviations cnt lies above mean when cnt
            exceeds mean + sigmaCount * stdv; otherwise return None.
        """
        if not stdv:
            # With no spread nothing can be called an outlier, and the
            # division below would fail.
            return None
        if cnt > mean + self.sigmaCount * stdv:
            return (cnt - mean) / stdv
        return None

    def getOutliersAll(self):
        """ must return a dict of (key, sigmas if > sigmaCount)

            Checks every tracked source IP.
        """
        # Compute each estimate once; it is needed for both the statistics
        # and the outlier test.
        counts = {key: hll.cardinality()
                  for key, hll in self.cardinalityDict.items()}
        s = Stdev()
        for cnt in counts.values():
            s.add(cnt)
        mean = s.getMean()
        stdv = s.getStdev()
        outliers = {}
        for key, cnt in counts.items():
            sigs = self._sigmasIfOutlier(cnt, mean, stdv)
            if sigs is not None:
                outliers[key] = sigs
        return outliers

    def logOutput(self, key, result):
        """ key: an item being tracked
            result: bool -- True if starting above sigmaCount, False if ending above it.
        """
        dt = timestampToDatetime(self.lastTimestamp)
        if result:
            print(
                "%s ::: IP address %s became an outlier for IP port scanning."
                % (dt, key))
        else:
            print(
                "%s ::: IP address %s is no longer an outlier for IP port scanning."
                % (dt, key))

    def getOutliers(self):
        """ must return a dict of (key, sigmas > sigmaCount)

            Only the topN largest cardinalities are tested, plus every key
            reported by self.timePeriodMap.getActives() (presumably the keys
            currently flagged as outliers -- confirm against NetflowDetector).
        """
        outliers = {}
        s = Stdev()
        top = []  # min-heap of (cardinality, key) holding the topN largest
        topN = self.topN
        for key, hll in self.cardinalityDict.items():
            cnt = hll.cardinality()
            s.add(cnt)
            if len(top) < topN:
                heapq.heappush(top, (cnt, key))
            else:
                heapq.heappushpop(top, (cnt, key))
        mean = s.getMean()
        stdv = s.getStdev()
        for cnt, key in top:
            sigs = self._sigmasIfOutlier(cnt, mean, stdv)
            if sigs is not None:
                outliers[key] = sigs
        for key in self.timePeriodMap.getActives():
            if key in outliers:
                continue  # already evaluated through the top-N heap
            # Look up this key's own sketch. .get() avoids creating an empty
            # sketch in the defaultdict for a key that was never seen.
            hll = self.cardinalityDict.get(key)
            if hll is None:
                continue
            sigs = self._sigmasIfOutlier(hll.cardinality(), mean, stdv)
            if sigs is not None:
                outliers[key] = sigs
        return outliers

    def getCardinalities(self):
        """ Return the cardinality estimates of all per-IP sketches, sorted
            in ascending order.
        """
        return sorted(hll.cardinality()
                      for hll in self.cardinalityDict.values())
Esempio n. 8
0
from HLL import HyperLogLog
from generate_rand import gen

# Create the sketch used by the rest of this script.
hll = HyperLogLog(5)  # use 2^5 registers

# Print every item produced by gen().
# NOTE(review): the loop only prints; nothing from gen() is added to the
# sketch -- confirm whether hll.add(i) was intended inside the loop.
for i in gen():
    print(i)
# A single literal item is added once, after the loop has finished.
hll.add('some data')

# Cardinality estimate for what has been added to the sketch.
estimate = hll.cardinality()