def test_add(self):
    """Add ten timestamped items and check which LPFM entries are populated.

    For every non-empty entry of ``LPFM`` the largest ``R`` among its
    ``(ts, R)`` pairs is paired with the entry's index, and the resulting
    list is compared against fixed reference values.
    """
    sketch = SlidingHyperLogLog(0.05, 100)
    for n in range(10):
        sketch.add(n, str(n))

    observed = []
    for index, entries in enumerate(sketch.LPFM):
        if not entries:
            # Entries that never received an item are skipped.
            continue
        observed.append((index, max(rank for _, rank in entries)))

    expected = [
        (1, 1), (41, 1), (44, 1), (76, 3), (103, 4),
        (182, 1), (442, 2), (464, 5), (497, 1), (506, 1),
    ]
    self.assertEqual(observed, expected)
def test_pickle(self):
    """Round-trip a populated sketch through pickle and compare its state.

    The restored object must agree with the original on ``window``,
    ``alpha``, ``p``, ``m`` and ``LPFM``.
    """
    a = SlidingHyperLogLog(0.05, 100)
    # range() rather than xrange(): xrange does not exist on Python 3,
    # while range() behaves the same for this loop on both major versions.
    for i in range(10000):
        a.add(i, str('k1-%d' % i))

    # The pickled bytes are produced right here, so loading them is safe.
    b = pickle.loads(pickle.dumps(a))

    self.assertEqual(a.window, b.window)
    self.assertEqual(a.alpha, b.alpha)
    self.assertEqual(a.p, b.p)
    self.assertEqual(a.m, b.m)
    self.assertEqual(a.LPFM, b.LPFM)
def test_calc_cardinality_sliding1(self):
    """Check truncated cardinality estimates at specific query timestamps.

    One item is added at timestamp 1, then two more at timestamps 2 and 3,
    to a sketch created with window 100. After each batch the integer part
    of ``card(timestamp)`` is compared with the expected count for a series
    of query timestamps.
    """
    sketch = SlidingHyperLogLog(0.05, 100)

    def check(expectations):
        # Each pair is (query timestamp, expected truncated estimate).
        for timestamp, expected in expectations:
            self.assertEqual(int(sketch.card(timestamp)), expected)

    sketch.add(1, 'k1')
    check([(1, 1), (101, 1), (102, 0)])

    sketch.add(2, 'k2')
    sketch.add(3, 'k3')
    check([(3, 3), (101, 3), (102, 2), (103, 1), (104, 0)])
def test_init(self):
    """Verify the attributes of a sketch built with error 0.05, window 100."""
    sketch = SlidingHyperLogLog(0.05, 100)
    # The same value is expected for ``m`` and for the length of ``LPFM``.
    expected_m = 512

    self.assertEqual(sketch.window, 100)
    self.assertEqual(sketch.p, 9)
    self.assertEqual(sketch.alpha, 0.7197831133217303)
    self.assertEqual(sketch.m, expected_m)
    self.assertEqual(len(sketch.LPFM), expected_m)
def __init__(self, downsample_rate=10000):
    """Set up an empty counter state.

    Args:
        downsample_rate: stored as ``_downsample_rate``; defaults to 10000.
    """
    # Configuration.
    self._downsample_rate = downsample_rate

    # Nothing has been processed yet: no step, no previous counts.
    self._current_step = None
    self._lastcounts = None

    # Accumulated results.
    self._stack_dist_counts = {}
    self._countmatrix = []

    # Sliding HyperLogLog sketch built with 0.3 and an infinite window.
    self._shll = SlidingHyperLogLog(0.3, float("inf"))
def test_calc_cardinality(self):
    """Statistically check cardinality estimates for several set sizes.

    For each target cardinality, ``n`` independent sketches are filled with
    that many random 20-byte keys stamped with the current time. The mean
    estimate is turned into a z-score using ``rel_err`` and must lie
    strictly between -1.96 and 1.96.
    """
    clist = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
    n = 30
    rel_err = 0.05

    for card in clist:
        total = 0.0
        # range() rather than the Python 2-only xrange() so the test also
        # runs on Python 3.
        for _ in range(n):
            a = SlidingHyperLogLog(rel_err, 100)
            for _ in range(card):
                a.add(int(time.time()), os.urandom(20))
            total += a.card(int(time.time()))

        z = (total / n - card) / (rel_err * card / math.sqrt(n))
        # NOTE(review): +/-1.96 is a 95% two-sided bound applied to nine
        # cardinalities, so occasional spurious failures are plausible;
        # consider a wider bound if this test proves flaky.
        self.assertLess(-1.96, z)
        self.assertGreater(1.96, z)
def test_calc_cardinality(self):
    """Statistically check cardinality estimates for several set sizes.

    For each target cardinality, ``n`` independent sketches are filled with
    that many random 20-byte keys stamped with the current time. The mean
    estimate is turned into a z-score using ``rel_err`` and must lie
    strictly between -3 and 3.
    """
    clist = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
    n = 30
    rel_err = 0.05

    for card in clist:
        total = 0.0
        # range() rather than the Python 2-only xrange() so the test also
        # runs on Python 3.
        for _ in range(n):
            a = SlidingHyperLogLog(rel_err, 100)
            for _ in range(card):
                a.add(int(time.time()), os.urandom(20))
            total += a.card(int(time.time()))

        z = (total / n - card) / (rel_err * card / math.sqrt(n))
        self.assertLess(-3, z)
        self.assertGreater(3, z)
def test_calc_cardinality_sliding3(self):
    """Check that ``card_wlist`` agrees with repeated ``card`` calls.

    For each size, a sketch whose window equals that size is filled with
    random keys at timestamps ``0 .. card - 1``. It is then queried at
    timestamp ``1.5 * card`` for a series of window lengths, once per
    window through ``card`` and once in a single ``card_wlist`` call. Both
    result lists must be equal.
    """
    clist = [30, 60, 200, 1000, 10000, 60000]
    rel_err = 0.05

    for card in clist:
        a = SlidingHyperLogLog(rel_err, card)
        # range() rather than the Python 2-only xrange().
        for i in range(card):
            a.add(i, os.urandom(20))

        # Build the window list once so both queries see identical input.
        windows = [w / 10.0 for w in range(1, card + 1, card // 10)]
        query_time = 1.5 * card

        one_by_one = [a.card(query_time, w) for w in windows]
        batched = a.card_wlist(query_time, windows)
        self.assertEqual(one_by_one, batched)
def test_calc_cardinality_sliding2(self):
    """Statistically check estimates when timestamps advance by 1/2000.

    For each target size, 30 sketches with window 100 are filled with
    random 20-byte keys at timestamps ``i / 2000.0`` and queried at
    ``size / 2000.0``. The z-score of the mean estimate, measured against
    ``min(size, 200000)``, must lie strictly between -3 and 3.
    """
    ticks_per_unit = 2000.0
    trials = 30
    tolerance = 0.05

    for size in (1, 5, 10, 30, 60, 200, 1000, 10000, 60000):
        accumulated = 0.0
        for _ in range(trials):
            sketch = SlidingHyperLogLog(tolerance, 100)
            for step in range(size):
                sketch.add(step / ticks_per_unit, os.urandom(20))
            accumulated += sketch.card(size / ticks_per_unit)

        reference = min(size, 200000)
        mean_estimate = float(accumulated) / trials
        standard_error = tolerance * reference / math.sqrt(trials)
        z_score = (mean_estimate - reference) / standard_error

        self.assertLess(-3, z_score)
        self.assertGreater(3, z_score)
def test_calc_cardinality_sliding2(self):
    """Statistically check estimates when timestamps advance by 1/2000.

    For each target cardinality, ``n`` sketches with window 100 are filled
    with random 20-byte keys at timestamps ``i / 2000.0`` and queried at
    ``card / 2000.0``. The z-score of the mean estimate, measured against
    ``min(card, 200000)``, must lie strictly between -3 and 3.
    """
    clist = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
    n = 30
    rel_err = 0.05

    for card in clist:
        total = 0.0
        # range() rather than the Python 2-only xrange() so the test also
        # runs on Python 3.
        for _ in range(n):
            a = SlidingHyperLogLog(rel_err, 100)
            for i in range(card):
                a.add(i / 2000.0, os.urandom(20))
            total += a.card(card / 2000.0)

        card_stored = min(card, 200000)
        z = (total / n - card_stored) / (rel_err * card_stored / math.sqrt(n))
        self.assertLess(-3, z)
        self.assertGreater(3, z)
def test_from_list(self):
    """Rebuild a sketch from another sketch's ``LPFM`` and compare the two.

    The rebuilt sketch must compare equal to the source, give the same
    ``card(9)``, and its per-window ``card`` results must match the
    source's single ``card_wlist`` call for windows 100, 3 and 5.
    """
    source = SlidingHyperLogLog(0.05, 100)
    for n in range(10):
        source.add(n, str(n))

    rebuilt = SlidingHyperLogLog.from_list(source.LPFM, 100)

    self.assertEqual(source, rebuilt)
    self.assertEqual(source.card(9), rebuilt.card(9))

    batched = source.card_wlist(9, [100, 3, 5])
    individually = [rebuilt.card(9, window) for window in (100, 3, 5)]
    self.assertEqual(batched, individually)
def test_calc_cardinality_sliding3(self):
    """Check that ``card_wlist`` agrees with repeated ``card`` calls.

    For each size, a sketch whose window equals that size is filled with
    random keys at timestamps ``0 .. card - 1``. It is then queried at
    timestamp ``1.5 * card`` for a series of window lengths, once per
    window through ``card`` and once in a single ``card_wlist`` call. Both
    result lists must be equal.
    """
    clist = [30, 60, 200, 1000, 10000, 60000]
    rel_err = 0.05

    for card in clist:
        a = SlidingHyperLogLog(rel_err, card)
        # range() rather than the Python 2-only xrange().
        for i in range(card):
            a.add(i, os.urandom(20))

        # Floor division keeps the step an int: on Python 3 ``card / 10``
        # is a float, which range() rejects. On Python 2 ``//`` and ``/``
        # give the same result for ints.
        windows = [w / 10.0 for w in range(1, card + 1, card // 10)]
        query_time = 1.5 * card

        l1 = [a.card(query_time, w) for w in windows]
        l2 = a.card_wlist(query_time, windows)
        self.assertEqual(l1, l2)
def test_update_err(self):
    """Updating from a sketch built with a different error rate must fail.

    One sketch is created with 0.05 and the other with 0.01, both with
    window 100; ``update`` is expected to raise ``ValueError``.
    """
    target = SlidingHyperLogLog(0.05, 100)
    other = SlidingHyperLogLog(0.01, 100)

    with self.assertRaises(ValueError):
        target.update(other)
def test_update(self):
    """Merging two sketches must equal one sketch fed both key sets.

    ``a`` receives the ``k1-*`` keys, ``b`` the ``k2-*`` keys, and ``c``
    receives both. After ``a.update(b)``, ``a`` must equal ``c`` while
    ``b`` differs from both ``a`` and ``c``.
    """
    a = SlidingHyperLogLog(0.05, 100)
    b = SlidingHyperLogLog(0.05, 100)
    c = SlidingHyperLogLog(0.05, 100)

    # range() rather than the Python 2-only xrange() so the test also runs
    # on Python 3. The two loops stay separate to keep the order in which
    # ``c`` receives its keys unchanged.
    for i in range(10000):
        a.add(i, str('k1-%d' % i))
        c.add(i, str('k1-%d' % i))

    for i in range(10000):
        b.add(i, str('k2-%d' % i))
        c.add(i, str('k2-%d' % i))

    a.update(b)

    self.assertNotEqual(a, b)
    self.assertNotEqual(b, c)
    self.assertEqual(a, c)
class CounterStack(object):
    """Class that takes a stream of input symbols and keeps track of stack distances.

    Every symbol is added to a sliding HyperLogLog sketch. On each step that
    is a multiple of the downsample rate, the sketch is queried for a list
    of window sizes, and the change relative to the previous sample is
    folded into a histogram of stack distances.
    """

    def __init__(self, downsample_rate=10000):
        """Create an empty counter stack.

        Args:
            downsample_rate: number of steps between two samples of the
                counters; also the spacing of the queried window sizes and
                the bin width of the histogram.
        """
        self._lastcounts = None
        self._downsample_rate = downsample_rate
        self._stack_dist_counts = {}
        self._current_step = None
        # Defined up front so the attribute exists before the first symbol.
        self._symbol_count = 0
        self._shll = SlidingHyperLogLog(0.3, float("inf"))
        self._countmatrix = []

    def process_sequence_symbol(self, symbol):
        """Feed one symbol into the sketch and sample the counters if due.

        Args:
            symbol: the item to add to the sliding HyperLogLog sketch,
                timestamped with the current step.
        """
        # Current step starts at 0
        self._current_step = 0 if self._current_step is None else self._current_step + 1
        # Symbol count starts at 1
        self._symbol_count = self._current_step + 1
        # Add the current symbol to the sliding HLL counter
        self._shll.add(self._current_step, symbol)

        # Only sample the counters on observable time steps (multiples of
        # "d", the downsample rate).
        if not self.is_observable_time():
            return

        if self.is_empty():
            self._lastcounts = np.zeros((1, 1))
        else:
            # Grow the previous column by one zero row so it lines up with
            # the new, one-element-longer column of counts.
            self._lastcounts = np.vstack((self._lastcounts, np.zeros((1, 1))))

        # Window sizes 1, 1 + d, 1 + 2d, ... up to the current step.
        # range() rather than the Python 2-only xrange().
        windows = range(1, self._current_step + self._downsample_rate,
                        self._downsample_rate)
        new_counts = self._shll.card_wlist(self._symbol_count, windows)
        # Reverse the list and stack it as a column vector.
        new_counts_column = np.array(new_counts)[::-1].reshape(-1, 1)

        # Two-column matrix: previous counts next to the new counts.
        countmatrix = np.c_[self._lastcounts, new_counts_column]
        self._countmatrix = countmatrix

        self._record_stack_distances(countmatrix)

        # Record the newest column of counter values for the next round
        self._lastcounts = new_counts_column

    def _record_stack_distances(self, countmatrix):
        """Fold the change between the two columns of *countmatrix* into the histogram."""
        # Difference between the previous and the new column (delta X).
        delta_x = np.diff(countmatrix)
        # Row-to-row change of delta X, with a zero row prepended (delta Y).
        delta_y = np.diff(np.r_[np.zeros((1, delta_x.shape[1])), delta_x], axis=0)
        # Set the last element in delta y to 1-delta_x (according to the algorithm)
        delta_y[-1, -1] = 1 - delta_x[-1, -1]

        rate = float(self._downsample_rate)
        histogram = self._stack_dist_counts
        row_counts = delta_y[:, -1].tolist()
        row_distances = countmatrix[:, -1].tolist()
        for stack_dist_count, stack_dist in zip(row_counts, row_distances):
            # Only record stack distances that don't have a count of 0 (to save memory)
            if stack_dist_count == 0:
                continue
            # Bins are the stack distance divided by the downsample rate,
            # rounded up (according to the algorithm).
            bin_key = np.ceil(stack_dist / rate)
            histogram[bin_key] = histogram.get(bin_key, 0) + stack_dist_count

    def get_stack_distance_counts(self):
        """Return the histogram as two parallel lists sorted by bin.

        Returns:
            ``(bins, values)`` where each bin is the stored bin number
            multiplied by the downsample rate. Both lists are empty when
            nothing has been recorded yet.
        """
        if not self._stack_dist_counts:
            # zip(*[]) yields nothing to unpack, so handle this explicitly.
            return [], []
        ordered = sorted(self._stack_dist_counts.items(), key=lambda t: t[0])
        # Multiply bins by delta (downsample rate) according to algorithm
        bins = [bin_number * self._downsample_rate for bin_number, _ in ordered]
        values = [count for _, count in ordered]
        return bins, values

    def is_observable_time(self):
        """Return True when the current step is a multiple of the downsample rate."""
        return self._current_step % self._downsample_rate == 0

    def is_empty(self):
        """Return True when no column of counts has been recorded yet."""
        return self._lastcounts is None or len(self._lastcounts) == 0

    def total_size(self):
        """Return ``sys.getsizeof`` of the most recent count matrix."""
        return sys.getsizeof(self._countmatrix)