def test():
    """Basic smoke test: build a sketch in memory, export it, re-import it,
    compare the two, and confirm that a bad initialization raises."""
    print('build in memory check')
    cms = CountMinSketch(width=100000, depth=7)
    # insert 100 keys with increasing counts
    for i in range(100):
        cms.add(str(i), 100 * (i + 1))
    print(cms.check(str(0), 'min'))
    print(cms.check(str(0), 'mean'))
    print(cms.check(str(0), 'mean-min'))
    cms.export('./dist/py_test.cms')

    print('import from disk check')
    cmsf = CountMinSketch(filepath='./dist/py_test.cms')
    if cms.width != cmsf.width:
        print('width does not match!')
    if cms.depth != cmsf.depth:
        print('depth does not match!')
    print(cmsf.check(str(0), 'min'))
    print(cmsf.check(str(0), 'mean'))
    print(cmsf.check(str(0), 'mean-min'))

    try:
        print('\n\nTest invalid initialization')
        # neither width/depth nor a filepath: must raise
        cms_ex = CountMinSketch()
    except SyntaxError as ex:
        print(ex)
def show_statistics(): while True: H1 = copy.deepcopy(H) distinctFlows1 = copy.deepcopy(distinctFlows) N1 = copy.deepcopy(N) depth = 10 width = 40000 hash_functions = [hash_function(i) for i in range(depth)] sketch1 = CountMinSketch(depth, width, hash_functions, M=N1) for fp_key in H1: ef = H1[fp_key][0] rf = H1[fp_key][1] df = H1[fp_key][2] sketch1.add(fp_key, rf + df + ef) time.sleep(1) sketch = CountMinSketch(depth, width, hash_functions, M=N) for fp_key in H: ef = H[fp_key][0] rf = H[fp_key][1] df = H[fp_key][2] sketch.add(fp_key, rf + df + ef) top_flows = get_top_flows(sketch, sketch1) system('clear') print " flow rate" for flow in top_flows: print "#" + str(flow[0]) + " :: " + str(flow[1]) + "b/s"
def simulate_sfcmon(df, num_chunks):
    """Split *df* into chunks and run SFCMon heavy-hitter detection on each.

    Returns a dict with two sets of source IPs: those heavy in packet
    count ("packets") and those heavy in byte volume ("size").
    """
    print("#################### PROCESSING CHUNKS #########################")
    num_packets = df.shape[0]
    print("# Number of packets = {}".format(num_packets))
    # NOTE(review): true division -- chunk size may be a float; assumes
    # split() accepts that. TODO confirm against the helper's signature.
    chunks = split(df, num_packets / num_chunks)
    results = {"packets": set(), "size": set()}
    for chunk in chunks:
        print("# Chunk Data: {}; {}".format(chunk.shape, chunk.index))
        # one sketch counts packets per source IP, the other byte volume
        sketch_packets = CountMinSketch(5436, 5)  # width=5436, depth=5
        sketch_size = CountMinSketch(5436, 5)
        count_packets = 0
        count_size = 0
        for src_ip, pkt_size in zip(chunk["SrcIP"], chunk["Size"]):
            sketch_packets.add(src_ip)
            sketch_size.add(src_ip, value=pkt_size)
            count_packets += 1
            count_size += pkt_size
            # only flag heavy hitters after a warm-up of num_rows_to_start rows
            if count_packets > num_rows_to_start:
                if sketch_packets[src_ip] > count_packets * bound:
                    results["packets"].add(src_ip)
                if sketch_size[src_ip] > count_size * bound:
                    results["size"].add(src_ip)
    return results
def runner(input_file_name, config):
    """Build a CountMinSketch over the integers listed in *input_file_name*.

    Sketch dimensions follow Cormode & Muthukrishnan: width m = ceil(e/eps)
    and depth d = ceil(ln(1/delta)), taken from config["eps"] and
    config["delta"].

    Returns a dict with the per-item estimated counts ("acc"), the elapsed
    insertion time ("time"), and a "space" placeholder (always 0).
    """
    # read the list of integers, one per line
    with open(input_file_name) as f:
        lst = [int(line.strip()) for line in f]

    m = math.ceil(math.e / config["eps"])
    # bug fix: `ln` is not a defined/stdlib name -- use math.log, the
    # natural logarithm, which is what ln(1/delta) means here
    d = math.ceil(math.log(1 / config["delta"]))
    sketch = CountMinSketch(m, d)

    # measure the running time of the insertions only
    startTime = time.time()
    for num in lst:
        sketch.add(num)
    endTime = time.time()

    # query the estimated frequency of every item (queries are untimed)
    result = {}
    for num in lst:
        result[num] = sketch[num]

    return {
        "acc": result,
        "time": endTime - startTime,
        "space": 0,
    }
def runner(input_file_name, config):
    """Measure CountMinSketch estimation error on a synthetic Zipf stream.

    The input file is ignored; a stream of config["n"] items with skew
    config["skew"] is generated instead, using the current time as seed.
    """
    # read the list of integers
    # with open(input_file_name) as f:
    #     lst = list(map(lambda l: int(l.strip()), f.readlines()))
    sketch = CountMinSketch(config["m"], config["d"])
    seed = int(time.time() * 1000) % 2**32
    lst = AxProf.zipfGenerator(config["n"], config["skew"], seed)

    # measure the running time of the insertions
    startTime = time.time()
    for num in lst:
        sketch.add(num)
    endTime = time.time()

    # absolute overestimate of the sketch versus exact counts, per item
    actual_map = actual_count(lst)
    error_map = {num: abs(actual_map[num] - sketch[num]) for num in set(lst)}

    return {
        "input": lst,
        "acc": error_map,
        # cost is reported as m*d rather than wall time (endTime - startTime)
        "time": config["m"] * config["d"],
        "space": 0,
    }
def update_statistics(): global N, H depth = 10 width = 40000 hash_functions = [hash_function(i) for i in range(depth)] sketch = CountMinSketch(depth, width, hash_functions, M=N) for fp_key in H: ef = H[fp_key][0] rf = H[fp_key][1] df = H[fp_key][2] sketch.add(fp_key, rf + df + ef) system('clear') flows_to_display = [] for flow in distinctFlows: flows_to_display.append((flow, sketch.query(flow))) for flow in H.keys(): flows_to_display.append((flow, sketch.query(flow))) top_flows = sorted(flows_to_display, key=lambda x: x[1], reverse=True)[0:20] for flow in top_flows: print flow print "Total flows:" + str(len(distinctFlows) + len(H.keys()))
def test_counts_overestimate(self):
    """A count-min sketch may only over-estimate true frequencies, never
    under-estimate them."""
    text = open(__file__).read()
    exact = Counter(text)
    sketch = CountMinSketch(10, 5)
    for ch in text:
        sketch.add(ch)
    # every estimated count must be >= the exact count
    for ch in set(text):
        self.assertGreaterEqual(sketch[ch], exact[ch])
def test_simple_usage(self):
    """Adding one key N times makes query() report N (and len() too)."""
    N = 1000
    sketch = CountMinSketch(10, 5)
    for _ in xrange(N):
        sketch.add("a")
    # the added key counts N; an unseen key counts 0
    self.assertEqual(sketch.query("a"), N)
    self.assertEqual(sketch.query("b"), 0)
    # len() is the total number of additions
    self.assertEqual(len(sketch), N)
def aggregate_unit(self, data_block):
    """Absorb one full time unit of data and refresh the dyadic sketch list.

    data_block: iterable of the items that arrived in a single time unit.
    After the call, cm_sketch_list[i] covers the most recent 2^i time
    units and aggregate_score holds sum_i cm_sketch_list[i] / 2^i.
    """
    # update time once per unit -- this tracks the current time unit
    self.t += 1
    # convert the data_block into a CM sketch
    accumulator = CMSketch(self.m, self.d)
    # reset the present: data_block IS the present. While this data was
    # coming in, a separate structure held exact sub-unit frequencies
    # that could be queried directly.
    self.present = CMSketch(self.m, self.d)
    for data in data_block:
        # each appearance counts with frequency 1
        accumulator.add(data, 1)
        # update present as we update the accumulator
        self.present.add(data, 1)
    # scores are stale while we rebuild the aggregate (M_bar) below
    self.ready = False

    # l = max over all i such that (t mod 2^i) == 0
    # efficient -- takes log t time to find at worst
    def find_l(t):
        l = 0
        if t == 0:
            return l
        while t % 2 == 0:
            l += 1
            # NOTE(review): true division -- t becomes a float under
            # Python 3; the loop still terminates but // would be cleaner
            t = t/2
        return l

    # merge up to index find_l + 1, capped at the number of sketches n
    for i in range(min(find_l(self.t) + 1, self.n)):
        # add the appropriate value: A + (1/2)^i * (M_bar - M^j)
        # first form M_bar - M^j
        difference = sketch_sum(accumulator, sketch_scalar_product(self.cm_sketch_list[i], -1))
        # A = A + (1/2)^i * difference
        self.aggregate_score = sketch_sum(self.aggregate_score, sketch_scalar_product(difference, pow(0.5, i)))
        # temporary storage of the running dyadic merge
        T = deepcopy(accumulator)
        # aggregate into accumulator for next round
        accumulator = sketch_sum(accumulator, self.cm_sketch_list[i])
        # set the value
        self.cm_sketch_list[i] = T
    # now we're ready to use CM-sketch values
    self.ready = True
    # reset the present now that we're done with one time block
    self.present = CMSketch(self.m, self.d)
def plot_error(lst, m, d, color=None, label=None):
    """Histogram the absolute count-min estimation errors over *lst*
    for a sketch of width *m* and depth *d*."""
    sketch = CountMinSketch(m, d)
    actual_map = actual_count(lst)
    for item in lst:
        sketch.add(item)
    # the sketch can only over-estimate, so each error is >= 0
    errors = [abs(actual_map[item] - sketch[item]) for item in set(lst)]
    print(len(errors))
    plot.hist(errors, bins=len(errors) // 100, color=color, label=label)
    plot.xlim(0, int(len(lst) * 0.1))
def simulate_rtp4mon(df, bound, pkts_to_start):
    """Detect heavy-hitter source IPs (and their full 5-tuples) in *df*.

    A source IP is a heavy hitter once its sketched packet count exceeds
    bound * total packets seen, checked only after pkts_to_start packets.
    """
    print("# " + str(datetime.datetime.now()) + " - Begin simulation RTP4Mon...")
    results = {"packets": set(), "5t_packets": set()}
    sketch_packets = CountMinSketch(5436, 5)  # width=5436, depth=5
    count_packets = 0
    five_tuples = zip(df["SrcIP"], df["DstIP"], df["SrcPort"],
                      df["DstPort"], df["Proto"])
    for five_flow_id in five_tuples:
        flow_id = five_flow_id[0]  # source IP only
        sketch_packets.add(flow_id)
        count_packets += 1
        # only flag heavy hitters after the warm-up period
        if count_packets > pkts_to_start:
            if sketch_packets[flow_id] > count_packets * bound:
                results["packets"].add(flow_id)
                results["5t_packets"].add(five_flow_id)
    print("# " + str(datetime.datetime.now()) + " - End simulation RTP4Mon...")
    return results
def worker(index, path): global counter """ :param index: the index of the dump this worker should work on. :return: """ print "Process %d start processing" % index with open("%s/wiki_0%s" % (path, index), "r") as f: batch = Counter() batch_limit = 10000 sketch = CountMinSketch(DEPTH, WIDTH, HASH_FUNCTIONS) current = datetime.now().date() for line in f: # Extrat timestamp from header if line[:4] == "<doc": m = TIMESTEMP_RE.search(line) if m: current = datetime.strptime(m.group(1), "%Y-%m-%dT%H:%M:%SZ").date() continue elif line[:5] == "</doc>": continue else: for pair in map(lambda word: (current, word.lower()), WORD_RE.findall(line)): batch[pair] += 1 if len(batch) > batch_limit: for key, count in batch.iteritems(): sketch.add(key, count) batch.clear() counter.value += 1 if counter.value % 10000 == 0: print "Processed %s lines" % counter.value for key, count in batch.iteritems(): sketch.add(key, count) batch.clear() print "Process %d finished" % index return sketch.get_matrix()
def build_countminsketch(ksup, w=1000, h=10):
    """Build and return a CountMinSketch over every kmer from *ksup*.

    :param ksup: KmerSupplier providing iterkmers()
    :param w: sketch width (table size)
    :param h: sketch depth (number of hash functions)
    """
    sketch = CountMinSketch(w, h)
    for kmer in ksup.iterkmers():
        sketch.add(kmer)
    return sketch
def test_syntax_sugar(self):
    """sketch[key] is equivalent to sketch.query(key), both before and
    after the key has been added."""
    sketch = CountMinSketch(10, 5)
    # unseen key
    self.assertEqual(sketch.query("a"), sketch["a"])
    # seen key
    sketch.add("a")
    self.assertEqual(sketch.query("a"), sketch["a"])
class History(object):
    """Multi-resolution frequency history built from count-min sketches.

    Maintains n CM-sketches M0..Mn at dyadic time resolutions, a
    "present" sketch for the current (partial) time unit, and an
    aggregate_score sketch A = sum_j M^j / 2^j that answers decayed
    frequency queries in O(1) sketch lookups.
    """

    def __init__(self, n, m, d):
        # time counter (update for each new unit)
        self.t = 0
        # n is number of CM sketches
        self.n = n
        # m is size of array for each hash function
        self.m = m
        # d is number of hash functions
        self.d = d
        # present is a count-min sketch containing sub-unit time counts
        # of indexes.
        self.present = CMSketch(m, d)
        # ready is a t/f value: whether to use aggregate_score alone, or
        # fall back to the present while the aggregate weighted score is
        # being recomputed
        self.ready = False
        # CM-sketch tracking the aggregate weighted score
        #   A = sum{j = 1 to log T} (M^j / 2^j)
        # (the present is added separately); kept current at every time
        # interval, initialized to zero
        self.aggregate_score = CMSketch(m, d)
        # n count-min sketches retaining resolutions 1, 2, 4, ..., 2^n;
        # sketch i rolls over when its time span of 2^i units is filled
        self.cm_sketch_list = []
        for i in range(n):
            self.cm_sketch_list.append(CMSketch(m, d))

    def update_present_only(self, datum):
        """Record *datum* as a sub-unit (partial time unit) observation."""
        # don't update the full time -- this is a sub-unit update
        self.ready = False
        self.present.add(datum, 1)

    # data_block is a block of data, presented as an iterable object;
    # it consists of all data that arrived in a single time unit.
    # Implements algorithm 2 from the paper.
    # This structure maintains n CM-sketches, M0, M1, ..., Mn:
    #   M0 always holds [t-1, t] where t is current time
    #   M1 always holds [t - tmod2 - 2, t - tmod2]
    #   ...
    #   Mn always holds [t - tmod(2^n) - 2^n, t - tmod(2^n)]
    # for t = 8, for example:
    #   M0: [7, 8]; M1: [6, 8]; M2: [4, 8]; M3: [0, 8]; rest: 0
    def aggregate_unit(self, data_block):
        """Absorb one full time unit of data and refresh the sketch list."""
        # update time once per unit -- this tracks the current time unit
        self.t += 1
        # convert the data_block into a CM sketch
        accumulator = CMSketch(self.m, self.d)
        # reset the present: data_block IS the present. While this data
        # was coming in, a separate structure held the exact sub-unit
        # frequencies for direct queries.
        self.present = CMSketch(self.m, self.d)
        for data in data_block:
            # each appearance counts with frequency 1
            accumulator.add(data, 1)
            # update present as we update the accumulator
            self.present.add(data, 1)
        # scores are stale while we rebuild the aggregate (M_bar) below
        self.ready = False

        # l = max over all i such that (t mod 2^i) == 0
        # efficient -- takes log t time to find at worst
        def find_l(t):
            l = 0
            if t == 0:
                return l
            while t % 2 == 0:
                l += 1
                # NOTE(review): true division -- t becomes a float under
                # Python 3; terminates anyway, but // would be cleaner
                t = t/2
            return l

        # merge up to index find_l + 1, capped at the number of sketches n
        for i in range(min(find_l(self.t) + 1, self.n)):
            # add the appropriate value: A + (1/2)^i * (M_bar - M^j)
            # first form M_bar - M^j
            difference = sketch_sum(accumulator, sketch_scalar_product(self.cm_sketch_list[i], -1))
            # A = A + (1/2)^i * difference
            self.aggregate_score = sketch_sum(self.aggregate_score, sketch_scalar_product(difference, pow(0.5, i)))
            # temporary storage of the running dyadic merge
            T = deepcopy(accumulator)
            # aggregate into accumulator for next round
            accumulator = sketch_sum(accumulator, self.cm_sketch_list[i])
            # set the value
            self.cm_sketch_list[i] = T
        # now we're ready to use CM-sketch values
        self.ready = True
        # reset the present now that we're done with one time block
        self.present = CMSketch(self.m, self.d)

    # the summed values live in their own count-min sketch (call it A),
    # updated in sync, so queries need not spend log T time summing;
    # this value provides the key for our heap
    def query_slow(self, x):
        """O(n) decayed score: present plus sum_i M^i(x) / 2^i."""
        return self.present.query(x) + sum(pow(0.5, i) * self.cm_sketch_list[i].query(x) for i in range(self.n))

    # the 'scores' were stored termwise in a CMSketch (matrix addition
    # and scalar multiplication), so taking its minimum is exactly
    # equivalent to the sum over the per-sketch minimums
    def query(self, x):
        """O(1) decayed score via the precomputed aggregate sketch."""
        if self.ready:
            return self.aggregate_score.query(x)
        else:
            # only if we're not ready: include the in-progress present
            return self.present.query(x) + self.aggregate_score.query(x)
def test_add_greater_than_one(self):
    """add(key, value) bumps the estimate by value, not by one."""
    increment = 123
    sketch = CountMinSketch(10, 5)
    sketch.add("a", increment)
    self.assertEqual(sketch.query("a"), increment)
stock_trade_filename = (source_data_dir + stock_etf + "_trade_" +
                        time_interval + str(no_of_record) + '.csv')

crossref_not_present_count = 0
crossref_present_count = 0

# CrossRef CMS Process Starts
#print('CrossRef CMS Process Starts Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
crossref_stock_trade_frq_cms = CountMinSketch(100000, 10)

stock_trade_file = open(stock_trade_filename, "r")
stock_trade_lines = csv.reader(stock_trade_file, delimiter=',',
                               quoting=csv.QUOTE_ALL, skipinitialspace=True)
next(stock_trade_file)  # skip the CSV header row

# CrossRef CMS Create: add every traded symbol to the sketch
#print('CrossRef CMS Create Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
for stock_trade_line in stock_trade_lines:
    stock_symbol = stock_trade_line[0].strip()
    add1 = crossref_stock_trade_frq_cms.add(stock_symbol)
stock_trade_file.close()

# CrossRef CMS Membership Check: count symbols present / absent in the sketch
#print('CrossRef CMS Membership Check Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
check_stock_symbol_file = open(check_stock_symbol_filename, "r")
for stock_symbol_line in check_stock_symbol_file:
    stock_symbol = stock_symbol_line.strip()
    stock_frq = crossref_stock_trade_frq_cms.check(stock_symbol)
    if stock_frq == 0:
        crossref_not_present_count += 1
    else:
        crossref_present_count += 1
check_stock_symbol_file.close()

# CrossRef CMS Process Completed
#print('CrossRef CMS Process Completed Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
stock_vol_apds = CountMinSketch(width, depth)
stock_trade_record_count = 0
vol_sketch_time = 0

# add elements to the sketch, timing only the sketch.add() calls
for stock_trade_line in stock_trade_lines:
    #print('stock_trade_record_count:',stock_trade_record_count,' no_of_record:',no_of_record)
    if stock_trade_record_count >= no_of_record:
        break
    stock_trade_record_count += 1

    stock_symbol = stock_trade_line[0].strip()
    trade_date = stock_trade_line[1].strip()
    stock_vol = int(stock_trade_line[7].strip())
    #stock_input_file.write(stock_symbol + "," + str(trade_date) + "," + str(stock_vol) + "\n")

    vol_sketch_starttime = time.process_time()
    apds_cmsadded = stock_vol_apds.add(stock_symbol, stock_vol)
    vol_sketch_endtime = time.process_time()
    vol_sketch_time += vol_sketch_endtime - vol_sketch_starttime
    #print(vol_sketch_time,vol_sketch_endtime,vol_sketch_starttime)

    # maintain exact per-symbol frequency / volume tallies as ground truth
    if stock_symbol in stock_vol_dist:
        stock_symbol_freq = stock_freq_dist[stock_symbol] + 1
        stock_symbol_vol = stock_vol_dist[stock_symbol] + stock_vol
    else:
        stock_symbol_freq = 1
        stock_symbol_vol = stock_vol
    stock_freq_dist[stock_symbol] = stock_symbol_freq
    stock_vol_dist[stock_symbol] = stock_symbol_vol
def CountMin_Sketch(stream, k, h):
    """Return a CountMinSketch of width *k* and depth *h* populated with
    every element of *stream*."""
    sketch = CountMinSketch(k, h)
    for element in stream:
        sketch.add(element)
    return sketch
tp += 1 else: tn += 1 if (g_it.target[i] == 1): coef = (part_minus + 1.0) / (part_plus + part_minus + 1.0) part_plus += 1 else: coef = (part_plus + 1.0) / (part_plus + part_minus + 1.0) part_minus += 1 tao = min(C, max(0.0, ((1.0 - g_it.target[i] * dot) * coef) / module)) if (tao > 0.0): for row, col in zip(rows, cols): ((row, col), ex[row, col]) #print col, ex[row,col] WCMS.add(col, g_it.target[i] * tao * ex[row, col]) #print "Correct prediction example",i, "pred", score, "target",g_it.target[i] if i % 50 == 0 and i != 0: #output performance statistics every 50 examples if (tn + fp) > 0: pos_part = float(fp) / (tn + fp) else: pos_part = 0 if (tp + fn) > 0: neg_part = float(fn) / (tp + fn) else: neg_part = 0 BER = 0.5 * (pos_part + neg_part) print "1-BER Window esempio ", i, (1.0 - BER)
#-
# Frequency using CSK
#-
from countminsketch import CountMinSketch

# Initialize: table size=1000, hash functions=10
ds = CountMinSketch(1000, 10)

# Add: 1 twice, 2 once
for item in (1, 2, 1):
    ds.add(item)

# Test: estimated counts match (3 was never added)
assert ds[1] == 2
assert ds[2] == 1
assert ds[3] == 0
len(pickle.dumps(Heavyhitters))) #%%CountMinSketch #pip install countminsketch if necessary from countminsketch import CountMinSketch sketch20_1 = CountMinSketch(20000, 1) # table size=20000, hash functions=1 sketch40_1 = CountMinSketch(40000, 1) sketch40_2 = CountMinSketch(40000, 2) sketch20_2 = CountMinSketch(20000, 2) sketch10_3 = CountMinSketch(10000, 3) sketch50_4 = CountMinSketch(50000, 4) for index, row in ratings.iterrows(): sketch20_1.add(row['movieId']) sketch40_1.add(row['movieId']) sketch40_2.add(row['movieId']) sketch20_2.add(row['movieId']) sketch10_3.add(row['movieId']) sketch50_4.add(row['movieId']) print( "Using a table of size 10000 and 3 hash functions estimated frequency of \n movie 296 is: ", sketch10_3[296]) print() print("\n Memory size (table of size 10000 and 3 hash functions) in bytes= ", len(pickle.dumps(sketch10_3))) print() print(