Beispiel #1
0
    def test_add(self):
        """Check which LPFM slots are filled after adding ten elements.

        Elements ``str(0)`` .. ``str(9)`` are added with timestamps 0..9 and
        the largest ``R`` stored in every non-empty slot is compared against
        a fixed list of ``(slot index, max R)`` pairs.
        """
        sketch = SlidingHyperLogLog(0.05, 100)

        for timestamp in range(10):
            sketch.add(timestamp, str(timestamp))

        # Each slot holds (timestamp, R) pairs; empty slots are skipped.
        observed = []
        for index, entries in enumerate(sketch.LPFM):
            if not entries:
                continue
            observed.append((index, max(rank for _, rank in entries)))

        expected = [(1, 1), (41, 1), (44, 1), (76, 3), (103, 4), (182, 1), (442, 2), (464, 5), (497, 1), (506, 1)]
        self.assertEqual(observed, expected)
Beispiel #2
0
    def test_from_list(self):
        """A sketch rebuilt with ``from_list`` must match the one it came from.

        The rebuilt sketch has to compare equal to the source sketch, report
        the same cardinality, and its per-window ``card`` results have to
        agree with the source's batched ``card_wlist`` call.
        """
        source = SlidingHyperLogLog(0.05, 100)
        for timestamp in range(10):
            source.add(timestamp, str(timestamp))

        rebuilt = SlidingHyperLogLog.from_list(source.LPFM, 100)

        self.assertEqual(source, rebuilt)
        self.assertEqual(source.card(9), rebuilt.card(9))

        # Batched query on the source first, then one query per window on
        # the rebuilt sketch.
        batched = source.card_wlist(9, [100, 3, 5])
        one_by_one = [rebuilt.card(9, window) for window in (100, 3, 5)]
        self.assertEqual(batched, one_by_one)
Beispiel #3
0
 def test_pickle(self):
     """A pickle round trip must preserve the sketch's parameters and data."""
     a = SlidingHyperLogLog(0.05, 100)
     # range() instead of xrange(): xrange does not exist on Python 3.
     for i in range(10000):
         a.add(i, str('k1-%d' % i))

     b = pickle.loads(pickle.dumps(a))

     # The attribute name is passed as the failure message so a mismatch
     # reports which field was lost in the round trip.
     for attr in ('window', 'alpha', 'p', 'm', 'LPFM'):
         self.assertEqual(getattr(a, attr), getattr(b, attr), attr)
Beispiel #4
0
 def test_pickle(self):
     """Serialising and deserialising a sketch must not change its state."""
     a = SlidingHyperLogLog(0.05, 100)
     # xrange() is Python 2 only; range() works on both major versions.
     for i in range(10000):
         a.add(i, str('k1-%d' % i))

     b = pickle.loads(pickle.dumps(a))

     # Compare every attribute the original test checked, naming the
     # attribute in the failure message.
     for attr in ('window', 'alpha', 'p', 'm', 'LPFM'):
         self.assertEqual(getattr(a, attr), getattr(b, attr), attr)
Beispiel #5
0
 def test_calc_cardinality_sliding1(self):
     """Elements must leave the estimate as the query time moves forward.

     With a window of 100, an element added at timestamp ``t`` is expected
     to be counted for a query at ``t + 100`` and not counted for a query
     at ``t + 101``.
     """
     sketch = SlidingHyperLogLog(0.05, 100)

     sketch.add(1, 'k1')
     # (query timestamp, expected truncated estimate)
     for now, expected in ((1, 1), (101, 1), (102, 0)):
         self.assertEqual(int(sketch.card(now)), expected)

     sketch.add(2, 'k2')
     sketch.add(3, 'k3')
     for now, expected in ((3, 3), (101, 3), (102, 2), (103, 1), (104, 0)):
         self.assertEqual(int(sketch.card(now)), expected)
Beispiel #6
0
    def test_calc_cardinality(self):
        """The mean estimate over ``n`` trials must be within 3 sigma of truth.

        For every target cardinality, ``n`` independent sketches are filled
        with that many random 20-byte values stamped with the current time.
        The mean estimate is converted to a z-score using ``rel_err`` as the
        per-sketch relative standard error, and must lie in (-3, 3).
        """
        clist = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
        n = 30
        rel_err = 0.05

        for card in clist:
            total = 0.0
            # range() instead of xrange(): xrange does not exist on Python 3.
            for _trial in range(n):
                a = SlidingHyperLogLog(rel_err, 100)

                for _ in range(card):
                    a.add(int(time.time()), os.urandom(20))

                total += a.card(int(time.time()))

            z = (total / n - card) / (rel_err * card / math.sqrt(n))
            self.assertLess(-3, z)
            self.assertGreater(3, z)
Beispiel #7
0
    def test_calc_cardinality_sliding3(self):
        """``card_wlist`` must agree with calling ``card`` once per window.

        For each size in ``clist`` a sketch with a window of that size is
        filled with random values at timestamps ``0 .. card - 1``. The same
        list of window lengths is then evaluated both ways at timestamp
        ``1.5 * card`` and the two result lists must be equal.
        """
        clist = [30, 60, 200, 1000, 10000, 60000]
        rel_err = 0.05

        for card in clist:
            a = SlidingHyperLogLog(rel_err, card)

            # range() instead of xrange(): xrange does not exist on Python 3.
            for i in range(card):
                a.add(i, os.urandom(20))

            # Build the window list once so both code paths see identical
            # inputs.
            windows = [w / 10.0 for w in range(1, card + 1, card // 10)]
            now = 1.5 * card

            one_by_one = [a.card(now, window) for window in windows]
            batched = a.card_wlist(now, windows)
            self.assertEqual(one_by_one, batched)
Beispiel #8
0
    def test_calc_cardinality_sliding2(self):
        """Estimates with fractional timestamps must stay within 3 sigma.

        Elements are added 2000 per time unit (timestamp ``i / 2000.0``) and
        the cardinality is queried at ``card / 2000.0``. The mean over ``n``
        trials is turned into a z-score that must lie in (-3, 3).
        """
        clist = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
        n = 30
        rel_err = 0.05

        for card in clist:
            total = 0.0
            # range() instead of xrange(): xrange does not exist on Python 3.
            for _trial in range(n):
                a = SlidingHyperLogLog(rel_err, 100)

                for i in range(card):
                    a.add(i / 2000.0, os.urandom(20))

                total += a.card(card / 2000.0)

            # A window of 100 time units at 2000 additions per unit spans
            # 200000 elements, so the cap does not bind for the sizes above.
            card_stored = min(card, 200000)
            z = (total / n - card_stored) / (rel_err * card_stored / math.sqrt(n))
            self.assertLess(-3, z)
            self.assertGreater(3, z)
Beispiel #9
0
    def test_calc_cardinality_sliding2(self):
        """Statistical accuracy check using fractional timestamps.

        Each trial adds ``card`` random 20-byte values at timestamps
        ``i / 2000.0`` and queries the estimate at ``card / 2000.0``. The
        mean of ``n`` trials is standardised and must fall inside (-3, 3).
        """
        clist = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
        n = 30
        rel_err = 0.05

        def run_trial(size):
            # Fill a fresh sketch and return its estimate.
            sketch = SlidingHyperLogLog(rel_err, 100)
            for index in range(size):
                sketch.add(index / 2000.0, os.urandom(20))
            return sketch.card(size / 2000.0)

        for card in clist:
            # Start from 0.0 and add in trial order, as a running float sum.
            total = sum((run_trial(card) for _ in range(n)), 0.0)
            mean_estimate = float(total) / n

            card_stored = min(card, 200000)
            std_error = rel_err * card_stored / math.sqrt(n)
            z = (mean_estimate - card_stored) / std_error

            self.assertLess(-3, z)
            self.assertGreater(3, z)
Beispiel #10
0
    def test_update(self):
        """Merging ``b`` into ``a`` must equal a sketch fed both streams.

        ``a`` receives the ``k1-*`` stream, ``b`` the ``k2-*`` stream and
        ``c`` receives both. After ``a.update(b)``, ``a`` must equal ``c``
        while ``b`` stays different from both.
        """
        a = SlidingHyperLogLog(0.05, 100)
        b = SlidingHyperLogLog(0.05, 100)
        c = SlidingHyperLogLog(0.05, 100)

        # range() instead of xrange(): xrange does not exist on Python 3.
        for i in range(10000):
            a.add(i, str('k1-%d' % i))
            c.add(i, str('k1-%d' % i))

        for i in range(10000):
            b.add(i, str('k2-%d' % i))
            c.add(i, str('k2-%d' % i))

        a.update(b)

        self.assertNotEqual(a, b)
        self.assertNotEqual(b, c)
        self.assertEqual(a, c)
Beispiel #11
0
    def test_update(self):
        """``update`` must produce the same sketch as adding both streams.

        Sketch ``c`` sees every element that ``a`` and ``b`` see separately;
        once ``b`` is merged into ``a`` the result must compare equal to
        ``c``, and ``b`` must still differ from ``a`` and ``c``.
        """
        a = SlidingHyperLogLog(0.05, 100)
        b = SlidingHyperLogLog(0.05, 100)
        c = SlidingHyperLogLog(0.05, 100)

        # xrange() is Python 2 only; range() works on both major versions.
        for i in range(10000):
            a.add(i, str('k1-%d' % i))
            c.add(i, str('k1-%d' % i))

        for i in range(10000):
            b.add(i, str('k2-%d' % i))
            c.add(i, str('k2-%d' % i))

        a.update(b)

        self.assertNotEqual(a, b)
        self.assertNotEqual(b, c)
        self.assertEqual(a, c)
Beispiel #12
0
class CounterStack(object):
    """Track approximate stack distances over a stream of input symbols.

    Every symbol is added to a sliding HyperLogLog sketch. Once every
    ``downsample_rate`` symbols the sketch is queried for a list of window
    lengths, and the change of those counts relative to the previous sample
    is folded into a histogram of stack distances.
    """

    def __init__(self, downsample_rate=10000):
        """Create an empty counter stack.

        Args:
            downsample_rate: Number of symbols between two samples of the
                counters (called "d" in the comments below).
        """
        # Column of counts from the previous sample; None before the first.
        self._lastcounts = None
        self._downsample_rate = downsample_rate
        # Maps ceil(stack distance / d) to its accumulated count.
        self._stack_dist_counts = {}
        # Index of the most recent symbol; None until the first one arrives.
        self._current_step = None
        # Number of symbols processed so far (current step + 1).
        self._symbol_count = 0
        self._shll = SlidingHyperLogLog(0.3, float("inf"))
        # Last two columns of the counter matrix; a list until first sample.
        self._countmatrix = []

    def process_sequence_symbol(self, symbol):
        """Consume one symbol and, on sampling steps, update the histogram.

        Args:
            symbol: The next element of the input stream. It is passed to
                the sketch's ``add`` together with the current step index.
        """
        # Steps are numbered from 0, symbol counts from 1.
        self._current_step = 0 if self._current_step is None else self._current_step + 1
        self._symbol_count = self._current_step + 1

        self._shll.add(self._current_step, symbol)

        # Counters are only sampled when the step is a multiple of d.
        if not self.is_observable_time():
            return

        # The previous column gains a zero row so that it lines up with the
        # new column, which has one more entry than the last sample.
        fresh_row = np.zeros((1, 1))
        if self.is_empty():
            self._lastcounts = fresh_row
        else:
            self._lastcounts = np.vstack((self._lastcounts, fresh_row))

        # One window per sample taken so far: 1, 1 + d, ..., symbol_count.
        # Stopping at symbol_count + 1 selects the same windows as
        # current_step + d for d > 1 and also yields the right number of
        # windows for d == 1 (the old stop produced one too few there).
        windows = range(1, self._symbol_count + 1, self._downsample_rate)
        new_counts = self._shll.card_wlist(self._symbol_count, windows)
        # Reverse the counts and stack them as a column vector.
        new_counts_column = np.array(new_counts)[::-1].reshape(-1, 1)

        # Only the previous and the current column are kept.
        countmatrix = np.c_[self._lastcounts, new_counts_column]
        self._countmatrix = countmatrix

        # delta x: change of each counter between the two samples.
        delta_x = np.diff(countmatrix)
        # delta y: difference between vertically adjacent delta x values,
        # with a zero row prepended so the first row is kept as is.
        padded = np.r_[np.zeros((1, delta_x.shape[1])), delta_x]
        delta_y = np.diff(padded, axis=0)
        # The last element is set to 1 - delta x (according to the algorithm).
        delta_y[-1, -1] = 1 - delta_x[-1, -1]

        rate = float(self._downsample_rate)
        distances = countmatrix[:, -1].tolist()
        increments = delta_y[:, -1].tolist()
        for stack_dist, stack_dist_count in zip(distances, increments):
            # Zero counts are not recorded (to save memory).
            if stack_dist_count == 0:
                continue

            # Distances are binned by ceil(distance / d).
            key = np.ceil(stack_dist / rate)
            self._stack_dist_counts[key] = (
                self._stack_dist_counts.get(key, 0) + stack_dist_count
            )

        # Keep the newest column for the next sample.
        self._lastcounts = new_counts_column

    def get_stack_distance_counts(self):
        """Return the histogram as two parallel lists ``(bins, values)``.

        Bins are sorted in ascending order and multiplied by the downsample
        rate. Two empty lists are returned when nothing has been recorded.
        """
        if not self._stack_dist_counts:
            # Unpacking zip(*[]) would raise ValueError.
            return [], []

        ordered = sorted(self._stack_dist_counts.items(), key=lambda pair: pair[0])
        bins = [key * self._downsample_rate for key, _ in ordered]
        values = [count for _, count in ordered]
        return bins, values

    def is_observable_time(self):
        """Return True when the current step is a multiple of the rate."""
        return self._current_step % self._downsample_rate == 0

    def is_empty(self):
        """Return True when no previous column of counts is stored."""
        return self._lastcounts is None or len(self._lastcounts) == 0

    def total_size(self):
        """Return ``sys.getsizeof`` of the stored counter matrix."""
        return sys.getsizeof(self._countmatrix)