Ejemplo n.º 1
0
def update_statistics():
    """Clear the terminal and print the 20 flows with the largest estimates.

    A fresh CountMinSketch is built on every call and fed one entry per key
    of the global table ``H``; the amount added for a key is the sum of the
    first three elements of ``H[key]``.  Every flow in ``distinctFlows`` and
    every key of ``H`` is then queried against the sketch, and the 20
    highest ``(flow, estimate)`` pairs are printed, followed by a total.

    NOTE(review): a flow that appears in both ``distinctFlows`` and ``H``
    is listed (and counted in the total) twice -- confirm the two
    collections are disjoint.
    """
    global N, H

    depth = 10
    width = 40000
    hash_functions = [hash_function(i) for i in range(depth)]
    sketch = CountMinSketch(depth, width, hash_functions, M=N)

    for fp_key in H:
        counters = H[fp_key]  # look the entry up once instead of three times
        ef = counters[0]
        rf = counters[1]
        df = counters[2]
        sketch.add(fp_key, rf + df + ef)

    system('clear')

    # distinctFlows first, then the keys of H -- same order as before, so
    # the stable sort below breaks ties exactly as it used to.
    flows_to_display = [(flow, sketch.query(flow)) for flow in distinctFlows]
    flows_to_display.extend((flow, sketch.query(flow)) for flow in H)

    top_flows = sorted(flows_to_display, key=lambda pair: pair[1],
                       reverse=True)[:20]
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    for flow in top_flows:
        print(flow)
    print("Total flows:" + str(len(distinctFlows) + len(H)))
 def test_simple_usage(self):
     # Adds the key "a" one unit at a time and then asserts three things:
     # the query for "a" equals the number of adds, the query for a key
     # that was never added is 0, and len(sketch) equals the number of adds.
     N = 1000
     sketch = CountMinSketch(10, 5)
     # range instead of the Python-2-only xrange: runs on both 2 and 3,
     # and a 1000-element list on Python 2 is negligible.
     for _ in range(N):
         sketch.add("a")
     self.assertEqual(sketch.query("a"), N)
     self.assertEqual(sketch.query("b"), 0)
     self.assertEqual(len(sketch), N)
Ejemplo n.º 3
0
class History(object):
    """Time-weighted frequency history kept in a set of count-min sketches.

    The object holds ``n`` sketches at doubling time resolutions
    (``cm_sketch_list``), a sketch of the unit currently being filled
    (``present``), and a running ``aggregate_score`` sketch that is updated
    in step with the list so that ``query`` does not have to sum over all
    resolutions on every call.
    """

    def __init__(self, n, m, d):
        """Create an empty history.

        n -- number of CM sketches kept (resolutions 1, 2, 4, ...)
        m -- size of the array for each hash function
        d -- number of hash functions
        """
        # time counter (incremented once per aggregated unit)
        self.t = 0
        self.n = n
        self.m = m
        self.d = d
        # present is a count-min sketch containing
        # sub-unit time counts of indexes.
        self.present = CMSketch(m, d)
        # ready tells query() whether the aggregate alone is the score
        # (True) or whether the present must be added on top (False).
        self.ready = False
        # aggregate weighted score A = sum{j = 1 to log T} (M^j / 2^j),
        # kept up to date at every time interval; starts at zero.
        # (the present is added separately at query time)
        self.aggregate_score = CMSketch(m, d)
        # n count-min sketches, one per retained resolution
        self.cm_sketch_list = [CMSketch(m, d) for _ in range(n)]

    def update_present_only(self, datum):
        """Record one occurrence of datum in the present (sub-unit) sketch.

        The full time structure is not touched; ``ready`` is cleared so
        that ``query`` includes the present in its answer.
        """
        self.ready = False
        self.present.add(datum, 1)

    @staticmethod
    def _find_l(t):
        """Return the largest l such that 2**l divides t (0 when t == 0).

        Takes at most log t iterations.
        """
        exponent = 0
        if t == 0:
            return exponent
        while t % 2 == 0:
            exponent += 1
            # Floor division keeps t an integer on Python 3; true division
            # would turn it into a float and lose the low bits of large t.
            t //= 2
        return exponent

    # data_block is a block of data, presented as an iterable object
    # the block of data consists of data that arrived in a single time unit
    # implements algorithm 2 from the paper
    # this structure maintains n CM-sketches, M0, M1, ..., Mn
    # M0 always holds [t-1, t] where t is current time
    # M1 always holds [t - tmod2 - 2, t - tmod2]
    # ...
    # Mn always holds [t - tmod(2^n) - 2^n, t - tmod(2^n)]
    # for t = 8, for example:
    # M0: [7, 8]
    # M1: [6, 8]
    # M2: [4, 8]
    # M3: [0, 8]
    # rest: 0
    def aggregate_unit(self, data_block):
        """Fold one complete time unit of data into the history.

        data_block -- iterable of the items that arrived during the unit;
                      each item is counted with frequency 1 per appearance.
        """
        # update time once per unit
        self.t += 1
        # M_bar: the data_block converted into a CM sketch
        accumulator = CMSketch(self.m, self.d)

        # reset the present when we aggregate the whole thing
        # (data_block is the present)
        self.present = CMSketch(self.m, self.d)
        for data in data_block:
            accumulator.add(data, 1)
            # update present as we update the accumulator
            self.present.add(data, 1)

        self.ready = False
        # update the whole structure with M_bar:
        # go up to index l + 1, where l = max i such that (t mod 2^i) == 0,
        # or up to the number of sketches if that is smaller
        for i in range(min(self._find_l(self.t) + 1, self.n)):
            # M_bar - M^i
            difference = sketch_sum(
                accumulator,
                sketch_scalar_product(self.cm_sketch_list[i], -1))
            # A = A + (1/2)^i * (M_bar - M^i)
            self.aggregate_score = sketch_sum(
                self.aggregate_score,
                sketch_scalar_product(difference, pow(0.5, i)))
            # keep this round's accumulator before it is merged further
            snapshot = deepcopy(accumulator)
            # aggregate into accumulator for next round
            accumulator = sketch_sum(accumulator, self.cm_sketch_list[i])
            self.cm_sketch_list[i] = snapshot
        # now we're ready to use CM-sketch values
        self.ready = True
        # reset the present now that we're done with one time block
        self.present = CMSketch(self.m, self.d)

    def query_slow(self, x):
        """Return the score of x by summing over every resolution.

        Computes present(x) + sum_i 0.5**i * M_i(x) directly, one sketch
        query per resolution.
        """
        weighted = sum(pow(0.5, i) * self.cm_sketch_list[i].query(x)
                       for i in range(self.n))
        return self.present.query(x) + weighted

    # the scores are themselves stored in a CM sketch (aggregate_score),
    # built termwise with sketch addition and scalar multiplication
    def query(self, x):
        """Return the score of x from the aggregate sketch.

        While ``ready`` is False the present sketch's estimate is added
        on top of the aggregate.
        """
        if self.ready:
            return self.aggregate_score.query(x)
        # only if we're not ready
        return self.present.query(x) + self.aggregate_score.query(x)
 def test_zero_at_start(self):
     # A newly constructed sketch is probed with keys of several kinds
     # (ints, the tuple type, an empty tuple, strings, a bare object);
     # every query is asserted to return 0.
     fresh = CountMinSketch(10, 5)
     probes = (0, 1, -1, tuple, tuple(), "", "yeah", object())
     for key in probes:
         self.assertEqual(fresh.query(key), 0)
 def test_add_greater_than_one(self):
     # One add() call with an explicit amount; the query for that key
     # is asserted to return the same amount.
     key, amount = "a", 123
     cms = CountMinSketch(10, 5)
     cms.add(key, amount)
     self.assertEqual(cms.query(key), amount)
 def test_syntax_sugar(self):
     # sketch[key] is asserted to agree with sketch.query(key), both
     # before and after the key has been added once.
     key = "a"
     cms = CountMinSketch(10, 5)
     self.assertEqual(cms.query(key), cms[key])
     cms.add(key)
     self.assertEqual(cms.query(key), cms[key])