def test_cms_bytes(self):
    """Check that ``bytes(sketch)`` hashes to the known MD5 digest."""
    expected_digest = "fb1c39dd1a73f1ef0d7fc79f60fc028e"

    sketch = CountMinSketch(width=1000, depth=5)
    sketch.add("this is a test", 100)

    serialized = bytes(sketch)
    actual_digest = hashlib.md5(serialized).hexdigest()
    self.assertEqual(actual_digest, expected_digest)
def test_cms_join_mixed_types(self):
    """Join count-min, count-mean and count-mean-min sketches pairwise.

    Each sketch starts with one distinct key. After every ``join`` the
    receiving sketch must contain its own keys plus the keys of the sketch
    joined into it, and must not contain anything else.
    """
    key_min = "this is a test"
    key_mean = "this is another test"
    key_meanmin = "this is yet another test"

    cms = CountMinSketch(width=1000, depth=5)
    cmeans = CountMeanSketch(width=1000, depth=5)
    cmms = CountMeanMinSketch(width=1000, depth=5)
    cms.add(key_min, 500)
    cmeans.add(key_mean, 500)
    cmms.add(key_meanmin, 500)

    # count-min receives count-mean
    cms.join(cmeans)
    self.assertTrue(key_min in cms)
    self.assertTrue(key_mean in cms)
    self.assertFalse(key_meanmin in cms)

    # count-mean receives count-mean-min
    cmeans.join(cmms)
    self.assertFalse(key_min in cmeans)
    self.assertTrue(key_mean in cmeans)
    self.assertTrue(key_meanmin in cmeans)
    self.assertFalse("foobar" in cmeans)

    # count-mean-min receives the already-joined count-min
    cmms.join(cms)
    self.assertTrue(key_min in cmms)
    self.assertTrue(key_mean in cmms)
    self.assertTrue(key_meanmin in cmms)
    self.assertFalse("this is yet another test!" in cmms)
def test_cms_max_val(self):
    """Adding more than INT64_T_MAX must saturate instead of overflowing.

    The per-key count is expected to stop at INT32_T_MAX and the
    ``elements_added`` total at INT64_T_MAX.
    """
    key = 'this is a test'
    oversized_count = INT64_T_MAX + 5

    sketch = CountMinSketch(width=1000, depth=5)
    sketch.add(key, oversized_count)

    self.assertEqual(sketch.check(key), INT32_T_MAX)
    self.assertEqual(sketch.elements_added, INT64_T_MAX)
def test_cms_add_single(self):
    """Insert the same key four times, one occurrence per call."""
    key = 'this is a test'
    sketch = CountMinSketch(width=1000, depth=5)

    # Every add() returns the running count for the key.
    for expected_count in (1, 2, 3, 4):
        self.assertEqual(sketch.add(key), expected_count)

    self.assertEqual(sketch.elements_added, 4)
def test_cms_add_mult(self):
    """Insert the same key four times, four occurrences per call."""
    key = 'this is a test'
    sketch = CountMinSketch(width=1000, depth=5)

    # Every add() returns the running count for the key.
    for expected_count in (4, 8, 12, 16):
        self.assertEqual(sketch.add(key, 4), expected_count)

    self.assertEqual(sketch.elements_added, 16)
def test_cms_add_mult(self):
    """Add a key in batches of four and verify the running totals."""
    batch = 4
    key = "this is a test"
    sketch = CountMinSketch(width=1000, depth=5)

    running_totals = [4, 8, 12, 16]
    for total in running_totals:
        returned = sketch.add(key, batch)
        self.assertEqual(returned, total)

    self.assertEqual(sketch.elements_added, 16)
def test_cms_add_single(self):
    """Add a key one occurrence at a time and verify the running totals."""
    key = "this is a test"
    sketch = CountMinSketch(width=1000, depth=5)

    running_totals = [1, 2, 3, 4]
    for total in running_totals:
        returned = sketch.add(key)
        self.assertEqual(returned, total)

    self.assertEqual(sketch.elements_added, 4)
def test_cms_max_val(self):
    """Verify saturation when a single add exceeds INT64_T_MAX.

    The key's count must read back as INT32_T_MAX and ``elements_added``
    as INT64_T_MAX.
    """
    sketch = CountMinSketch(width=1000, depth=5)
    key = "this is a test"

    beyond_int64 = INT64_T_MAX + 5
    sketch.add(key, beyond_int64)

    observed_count = sketch.check(key)
    self.assertEqual(observed_count, INT32_T_MAX)
    self.assertEqual(sketch.elements_added, INT64_T_MAX)
def test_cms_export(self):
    """Export a sketch to a temporary file and compare the file's MD5."""
    expected_digest = "fb1c39dd1a73f1ef0d7fc79f60fc028e"

    sketch = CountMinSketch(width=1000, depth=5)
    sketch.add("this is a test", 100)

    # The temporary file lives in the current working directory.
    tmp_file = NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES)
    with tmp_file as fobj:
        sketch.export(fobj.name)
        actual_digest = calc_file_md5(fobj.name)

    self.assertEqual(actual_digest, expected_digest)
def test_cms_export(self):
    """Export a sketch to ``test.cms`` and compare the file's MD5 digest.

    The exported file is removed in a ``finally`` clause so that it does
    not linger in the working directory when exporting or hashing fails.
    """
    md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
    filename = 'test.cms'
    cms = CountMinSketch(width=1000, depth=5)
    cms.add('this is a test', 100)
    try:
        cms.export(filename)
        md5_out = calc_file_md5(filename)
    finally:
        # export() may have failed before creating the file at all.
        if os.path.exists(filename):
            os.remove(filename)
    self.assertEqual(md5_out, md5_val)
def test_cms_frombytes(self):
    """Round-trip a sketch through ``bytes()`` and ``frombytes()``."""
    key = "this is a test"
    original = CountMinSketch(width=1000, depth=5)
    original.add(key, 100)

    payload = bytes(original)
    restored = CountMinSketch.frombytes(payload)

    # The restored sketch serialises identically and keeps its geometry.
    self.assertEqual(bytes(restored), bytes(original))
    self.assertEqual(restored.width, 1000)
    self.assertEqual(restored.depth, 5)
    self.assertEqual(restored.check(key), 100)
def test_cms_join_overflow(self):
    """Joining a sketch with itself must saturate rather than overflow."""
    sketch = CountMinSketch(width=1000, depth=5)
    over_int32 = INT32_T_MAX + 5

    # Self-join doubles the counts: the key count saturates at INT32_T_MAX
    # while the element total is expected to equal INT32_T_MAX + 5.
    sketch.add("this is a test", over_int32 // 2)
    sketch.join(sketch)
    self.assertEqual(INT32_T_MAX, sketch.check("this is a test"))
    self.assertEqual(sketch.elements_added, over_int32)

    # A second self-join pushes the element total to its INT64_T_MAX cap.
    sketch.add("this is a test 2 ", INT64_T_MAX // 2)
    sketch.join(sketch)
    self.assertEqual(sketch.elements_added, INT64_T_MAX)
def test_cms_check_min(self):
    """Add four keys with distinct counts and read them back via check()."""
    inserted = [
        ("this is a test", 255),
        ("this is another test", 189),
        ("this is also a test", 16),
        ("this is something to test", 5),
    ]
    sketch = CountMinSketch(width=1000, depth=5)

    for key, count in inserted:
        self.assertEqual(sketch.add(key, count), count)

    # Query in the opposite order of insertion.
    for key, count in reversed(inserted):
        self.assertEqual(sketch.check(key), count)

    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_check_min(self):
    """Verify per-key counts returned by add() and check()."""
    sketch = CountMinSketch(width=1000, depth=5)
    keys = (
        'this is a test',
        'this is another test',
        'this is also a test',
        'this is something to test',
    )
    counts = (255, 189, 16, 5)

    for key, count in zip(keys, counts):
        returned = sketch.add(key, count)
        self.assertEqual(returned, count)

    # Lookups run from the last inserted key back to the first.
    for key, count in zip(keys[::-1], counts[::-1]):
        looked_up = sketch.check(key)
        self.assertEqual(looked_up, count)

    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_check_mean_called(self):
    """Verify per-key counts after switching ``query_type`` to 'mean'."""
    inserted = [
        ('this is a test', 255),
        ('this is another test', 189),
        ('this is also a test', 16),
        ('this is something to test', 5),
    ]
    sketch = CountMinSketch(width=1000, depth=5)
    sketch.query_type = 'mean'

    for key, count in inserted:
        self.assertEqual(sketch.add(key, count), count)

    # Query in the opposite order of insertion.
    for key, count in reversed(inserted):
        self.assertEqual(sketch.check(key), count)

    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_check_mean_called_even(self):
    """Verify per-key counts with ``query_type`` 'mean-min' and an even depth (6)."""
    inserted = [
        ('this is a test', 255),
        ('this is another test', 189),
        ('this is also a test', 16),
        ('this is something to test', 5),
    ]
    sketch = CountMinSketch(width=1000, depth=6)
    sketch.query_type = 'mean-min'

    for key, count in inserted:
        self.assertEqual(sketch.add(key, count), count)

    # Query in the opposite order of insertion.
    for key, count in reversed(inserted):
        self.assertEqual(sketch.check(key), count)

    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_check_mean_called_even(self):
    """Exercise the 'mean-min' query type on a sketch whose depth is even."""
    sketch = CountMinSketch(width=1000, depth=6)
    sketch.query_type = "mean-min"

    keys = (
        "this is a test",
        "this is another test",
        "this is also a test",
        "this is something to test",
    )
    counts = (255, 189, 16, 5)

    for key, count in zip(keys, counts):
        returned = sketch.add(key, count)
        self.assertEqual(returned, count)

    # Lookups run from the last inserted key back to the first.
    for key, count in zip(keys[::-1], counts[::-1]):
        looked_up = sketch.check(key)
        self.assertEqual(looked_up, count)

    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_remove_mult(self):
    """Remove several occurrences of a key in a single call."""
    key = "this is a test"
    sketch = CountMinSketch(width=1000, depth=5)

    count_after_add = sketch.add(key, 16)
    self.assertEqual(count_after_add, 16)
    self.assertEqual(sketch.elements_added, 16)

    count_after_remove = sketch.remove(key, 4)
    self.assertEqual(count_after_remove, 12)
    self.assertEqual(sketch.elements_added, 12)
def test_cms_remove_mult(self):
    """Add 16 occurrences of a key, remove 4 at once, and check both totals."""
    sketch = CountMinSketch(width=1000, depth=5)
    key = 'this is a test'

    # add() and remove() both return the key's resulting count.
    self.assertEqual(sketch.add(key, 16), 16)
    self.assertEqual(sketch.elements_added, 16)

    self.assertEqual(sketch.remove(key, 4), 12)
    self.assertEqual(sketch.elements_added, 12)
def test_cms_remove_single(self):
    """Remove a key one occurrence at a time."""
    key = 'this is a test'
    sketch = CountMinSketch(width=1000, depth=5)

    self.assertEqual(sketch.add(key, 4), 4)
    self.assertEqual(sketch.elements_added, 4)

    # Each remove() without a count drops exactly one occurrence.
    for remaining in (3, 2):
        self.assertEqual(sketch.remove(key), remaining)

    self.assertEqual(sketch.elements_added, 2)
def test_cms_remove_single(self):
    """Add four occurrences of a key, then remove two of them singly."""
    sketch = CountMinSketch(width=1000, depth=5)
    key = "this is a test"

    count_after_add = sketch.add(key, 4)
    self.assertEqual(count_after_add, 4)
    self.assertEqual(sketch.elements_added, 4)

    first_removal = sketch.remove(key)
    self.assertEqual(first_removal, 3)
    second_removal = sketch.remove(key)
    self.assertEqual(second_removal, 2)

    self.assertEqual(sketch.elements_added, 2)
def test_cms_clear(self):
    """Check that clear() resets the element total and the key's count."""
    key = 'this is a test'
    sketch = CountMinSketch(width=1000, depth=5)

    self.assertEqual(sketch.add(key, 100), 100)
    self.assertEqual(sketch.elements_added, 100)

    sketch.clear()

    self.assertEqual(sketch.elements_added, 0)
    self.assertEqual(sketch.check(key), 0)
def test_cms_clear(self):
    """Populate a sketch, clear it, and verify it reads back as empty."""
    sketch = CountMinSketch(width=1000, depth=5)
    key = "this is a test"

    count_after_add = sketch.add(key, 100)
    self.assertEqual(count_after_add, 100)
    self.assertEqual(sketch.elements_added, 100)

    sketch.clear()
    total_after_clear = sketch.elements_added
    self.assertEqual(total_after_clear, 0)
    count_after_clear = sketch.check(key)
    self.assertEqual(count_after_clear, 0)
def test_cms_str(self):
    """Compare ``str(sketch)`` against the expected multi-line summary."""
    sketch = CountMinSketch(width=1000, depth=5)
    self.assertEqual(sketch.add('this is a test', 100), 100)

    expected = (
        'Count-Min Sketch:\n'
        '\tWidth: 1000\n'
        '\tDepth: 5\n'
        '\tConfidence: 0.96875\n'
        '\tError Rate: 0.002\n'
        '\tElements Added: 100'
    )
    rendered = str(sketch)
    self.assertEqual(rendered, expected)
class CM4:
    """Depth-4 count-min sketch wrapper that remembers which keys it has seen."""

    def __init__(self, width=128):
        """Build the sketch; raise RuntimeError when ``width`` is below 1."""
        if width < 1:
            raise RuntimeError("bad width for cm4")
        self.cm4 = CountMinSketch(width, 4)
        self.keys = set()

    def add(self, key: str):
        """Count one occurrence of ``key`` and remember the key."""
        self.cm4.add(key)
        self.keys.add(key)

    def estimate(self, key: str):
        """Return the sketch's current count for ``key``."""
        return self.cm4.check(key)

    def reset(self):
        """Remove half of every remembered key's count from the sketch.

        A key whose halved count is zero has one occurrence removed instead
        and is dropped from the remembered key set.
        """
        int64_mask = 0x7FFFFFFFFFFFFFFF  # == 9223372036854775807
        # Iterate over a snapshot because keys are discarded during the loop.
        for key in tuple(self.keys):
            halved = (self.cm4.check(key) >> 1) & int64_mask
            if not halved:
                halved = 1
                self.keys.discard(key)
            self.cm4.remove(key, halved)
def test_cms_load(self):
    """Export a sketch to a temporary file and load it back via ``filepath``."""
    key = "this is a test"
    expected_digest = "fb1c39dd1a73f1ef0d7fc79f60fc028e"

    tmp_file = NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES)
    with tmp_file as fobj:
        original = CountMinSketch(width=1000, depth=5)
        self.assertEqual(original.add(key, 100), 100)
        original.export(fobj.name)

        actual_digest = calc_file_md5(fobj.name)
        self.assertEqual(actual_digest, expected_digest)

        # Build a second sketch straight from the exported file.
        restored = CountMinSketch(filepath=fobj.name)
        self.assertEqual(restored.elements_added, 100)
        self.assertEqual(restored.check(key), 100)
def test_cms_load(self):
    """Export a sketch to ``test.cms`` and load it back via ``filepath``.

    The exported file is removed in a ``finally`` clause so that a failing
    assertion does not leave ``test.cms`` behind in the working directory.
    """
    md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
    filename = 'test.cms'
    cms = CountMinSketch(width=1000, depth=5)
    self.assertEqual(cms.add('this is a test', 100), 100)
    try:
        cms.export(filename)
        md5_out = calc_file_md5(filename)
        self.assertEqual(md5_out, md5_val)

        # try loading directly to file!
        cms2 = CountMinSketch(filepath=filename)
        self.assertEqual(cms2.elements_added, 100)
        self.assertEqual(cms2.check('this is a test'), 100)
    finally:
        # export() may have failed before creating the file at all.
        if os.path.exists(filename):
            os.remove(filename)
def test_cms_load_diff_hash(self):
    """Load an exported sketch with a different hash function.

    The element total survives the reload, while the hashes computed for
    the same key must differ between the two sketches.
    """
    key = "this is a test"
    expected_digest = "fb1c39dd1a73f1ef0d7fc79f60fc028e"

    tmp_file = NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES)
    with tmp_file as fobj:
        original = CountMinSketch(width=1000, depth=5)
        self.assertEqual(original.add(key, 100), 100)
        original.export(fobj.name)

        actual_digest = calc_file_md5(fobj.name)
        self.assertEqual(actual_digest, expected_digest)

        reloaded = CountMinSketch(filepath=fobj.name, hash_function=different_hash)
        self.assertEqual(reloaded.elements_added, 100)

        # should not work since it is a different hash
        # NOTE(review): this checks the original sketch, not the reloaded
        # one, and compares against True - confirm the intended assertion.
        self.assertNotEqual(original.check(key), True)
        self.assertNotEqual(original.hashes(key), reloaded.hashes(key))
def test_cms_load_diff_hash(self):
    """Load an exported sketch with a different hash function.

    The element total survives the reload, while the hashes computed for
    the same key must differ between the two sketches. The exported file
    is removed in a ``finally`` clause so that a failing assertion does not
    leave ``test.cms`` behind in the working directory.
    """
    md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
    filename = 'test.cms'
    cms = CountMinSketch(width=1000, depth=5)
    self.assertEqual(cms.add('this is a test', 100), 100)
    try:
        cms.export(filename)
        md5_out = calc_file_md5(filename)
        self.assertEqual(md5_out, md5_val)

        cms2 = CountMinSketch(filepath=filename, hash_function=different_hash)
        self.assertEqual(cms2.elements_added, 100)

        # should not work since it is a different hash
        # NOTE(review): this checks the original sketch, not the reloaded
        # one, and compares against True - confirm the intended assertion.
        self.assertNotEqual(cms.check('this is a test'), True)
        self.assertNotEqual(cms.hashes('this is a test'),
                            cms2.hashes('this is a test'))
    finally:
        # export() may have failed before creating the file at all.
        if os.path.exists(filename):
            os.remove(filename)
def test_cms_join(self):
    """Join two identically populated sketches and expect doubled counts."""
    populated = [
        ("this is a test", 255),
        ("this is another test", 189),
        ("this is also a test", 16),
        ("this is something to test", 5),
    ]
    left = CountMinSketch(width=1000, depth=5)
    right = CountMinSketch(width=1000, depth=5)

    # Fill the receiving sketch first, then the one that gets joined in.
    for sketch in (left, right):
        for key, count in populated:
            self.assertEqual(count, sketch.add(key, count))

    left.join(right)

    for key, count in populated:
        self.assertEqual(count * 2, left.check(key))
# Stream the "movie" column of the ratings file through a count-min sketch,
# printing the sketch object's size every 1000 insertions and the total
# ingestion time at the end.
with open('data-streaming-project.data') as f:
    rows_number = sum(1 for line in f)

data_df = pd.read_csv(
    'data-streaming-project.data',
    encoding='utf-8',
    delimiter='\t',
    names=['user', 'movie', 'rating', 'timestamp'],
    header=None,
)
df_movies = data_df['movie']

cms = CountMinSketch(width=200, depth=21)

start_time2 = time.time()
counter = 0
# Iterate over the column itself instead of a hard-coded row count so the
# script keeps working when the data file grows or shrinks.
for movie in df_movies:
    counter += 1
    cms.add(str(movie))
    if counter % 1000 == 0:
        # sys.getsizeof is shallow: it does not follow references into
        # objects contained in the sketch.
        print(sys.getsizeof(cms))
end_time2 = time.time()

cms.check('592')
print(sys.getsizeof(cms))
print("Total execution time: {}".format(end_time2 - start_time2))
# Name of the sketch file, derived from the run parameters.
cms_filename = "".join([
    proj_dir,
    "cms_files/",
    stock_etf,
    time_interval,
    "_R",
    str(no_of_record),
    "_w",
    str(width),
    "_d",
    str(depth),
    "_freq.cms",
])

stock_symbol_dist = {}  # exact per-symbol counts, kept next to the sketch
stock_freq_cms = CountMinSketch(width, depth)
stock_trade_record_count = 0
sketch_time = 0
sketch_qrytime = 0
total_accuracy = 0

# Feed at most no_of_record trade lines into the sketch, accumulating the
# CPU time spent inside add().
for stock_trade_line in stock_trade_lines:
    stock_trade_record_count += 1
    if stock_trade_record_count > no_of_record:
        break
    stock_symbol = stock_trade_line[0].strip()

    sketch_starttime = time.process_time()
    add1 = stock_freq_cms.add(stock_symbol)
    sketch_endtime = time.process_time()
    sketch_time += sketch_endtime - sketch_starttime

    stock_symbol_freq = stock_symbol_dist.get(stock_symbol, 0) + 1
    stock_symbol_dist[stock_symbol] = stock_symbol_freq

# Query the sketch once per distinct symbol, accumulating the CPU time
# spent inside check().
for stock_symbol in list(stock_symbol_dist):
    sketch_starttime = time.process_time()
    stock_symbol_freq_cms = stock_freq_cms.check(stock_symbol)
    sketch_endtime = time.process_time()
    sketch_qrytime += sketch_endtime - sketch_starttime
# Walk the stream in windows, keeping an exact per-movie counter (movsFrq)
# and, when BQ1_2 is enabled, the count-min sketch estimate (movsFrqSkt).
while n < len(dfStrPr):
    dfStrPrWin = dfStrPr[start: end]
    # print("start:", start)
    # print("end:", end)
    # print(dfStrPrWin)
    for m, row in dfStrPrWin.iterrows():
        mov = dfStrPrWin.loc[m, "movie"]

        #BQ1_1 movie frq counter
        if mov in movsFrq.index:
            # Single .loc[row, column] write: the chained form
            # movsFrq.loc[mov]['Frq'] = ... may assign to a temporary copy
            # and leave the counter unchanged.
            movsFrq.loc[mov, 'Frq'] = movsFrq.loc[mov, 'Frq'] + 1
        else:
            movsFrq.loc[mov] = [1]

        #BQ1_2 min-sketch
        if BQ1_2 == 1:
            mov_s = str(mov)
            cms.add(mov_s)
            movsFrqSkt.loc[mov] = cms.check(mov_s)

    # NOTE(review): start advances by strmStep while end advances by a
    # fixed 1000 - confirm that the two are meant to differ.
    start = start + strmStep
    end = end + 1000
    n = n + strmStep

movsFrq.to_csv(storeResults + "movie-counter.csv")

#BQ1_3 Compare method accuracy
if BQ1_3 == 1:
    movsFrq = movsFrq.sort_values('Frq', ascending=False)
    movsFrqSkt = movsFrqSkt.sort_values('Frq', ascending=False)
    # NOTE(review): presumably sklearn's mean_squared_error, which returns
    # the MSE rather than the RMSE by default - confirm the printed label.
    rmse = mean_squared_error(movsFrq, movsFrqSkt)
    print("BQ1_3 Compare method accuracy, RMSE: ", rmse)
return reservoir #%% freq_data = top10freq(ip_data) # try different sizes sampled_data1 = reservoir_sampling(ip_data, 30000) freq_data1 = top10freq(sampled_data1) sampled_data2 = reservoir_sampling(ip_data, 100000) freq_data2 = top10freq(sampled_data2) sampled_data3 = reservoir_sampling(ip_data, 500000) freq_data3 = top10freq(sampled_data3) sampled_data4 = reservoir_sampling(ip_data, 1000000) freq_data4 = top10freq(sampled_data4) sampled_data5 = reservoir_sampling(ip_data, 2000000) freq_data5 = top10freq(sampled_data5) #%% CMS import time start_time = time.time() # count the compute time from probables import (CountMinSketch) cms = CountMinSketch(width=500, depth=100) cms.clear for i in ip_data: cms.add(i) time_spend = time.time() - start_time # cms.error_rate cms.elements_added
class makerCache:
    """Fixed-size page cache with per-slot ``marked`` flags and phases.

    A request for a cached page marks its slot; a miss fills an empty slot
    or replaces an unmarked one. Whenever every slot is marked, all flags
    are cleared and the phase counter advances. Request frequencies are
    kept in ``hashtable`` for pages the model classifies as heavy and in a
    count-min sketch for all other pages.
    """

    def __init__(self, size):
        """Create ``size`` empty slots, the model and the frequency trackers."""
        self.phase = 1
        self.round = 1
        self.size = size
        # Empty slots are represented by the placeholder page name '-'.
        self.cache = [
            Node('-', RequestFile(0, 'txt', False)) for _ in range(size)
        ]
        self.clean_counter = 0
        self.clean_set = []
        self.model = train_model()
        self.cms = CountMinSketch(width=1000, depth=5)
        self.hashtable = {}  # single bucket for heavy items
        self.miss_count = 0

    def print(self):
        """Print the page names of all slots on one line, then a spacer line."""
        for i in range(self.size):
            print(self.cache[i].page_name, end=' ')
        print(' ')
        print(' ')

    def get_phase(self):
        """Return the current phase number."""
        return self.phase

    def isAllSlotTaken(self):
        """Return True when no slot holds the '-' placeholder."""
        return all(
            self.cache[i].page_name != '-' for i in range(self.size)
        )

    def lookup(self, page_name):
        """Return True when ``page_name`` is currently cached."""
        return any(
            self.cache[i].page_name == page_name for i in range(self.size)
        )

    def set_marked(self, page_name):
        """Mark every slot that holds ``page_name``."""
        for i in range(self.size):
            if self.cache[i].page_name == page_name:
                # Fixed typo: the flag used to be written to ``makred``,
                # so cache hits never marked anything.
                self.cache[i].marked = True

    def is_all_marked(self):
        """Return True when every slot is marked."""
        return all(self.cache[i].marked for i in range(self.size))

    def reset(self):
        """Unmark every slot and start a new phase."""
        for i in range(self.size):
            self.cache[i].marked = False
        self.phase = self.phase + 1
        self.clean_counter = 0

    def evict(self, slot_pos):
        """Reset slot ``slot_pos`` to the empty, unmarked placeholder state."""
        slot = self.cache[slot_pos]
        slot.page_name = '-'
        slot.request_file.update_year = 0
        slot.request_file.file_type = '-'
        # Fixed typo (was ``is_in_hompage``): write the attribute that
        # fill_in() reads from the incoming request file.
        slot.request_file.is_in_homepage = False
        slot.marked = False

    def fill_in(self, slot_pos, new_page_name, request_file):
        """Store a page in slot ``slot_pos``, mark it and count the request."""
        slot = self.cache[slot_pos]
        slot.page_name = new_page_name
        slot.request_file.update_year = request_file.update_year
        slot.request_file.file_type = request_file.file_type
        # Fixed typo (was ``is_in_hompage``), so the stored file really
        # carries the homepage flag of the request.
        slot.request_file.is_in_homepage = request_file.is_in_homepage
        slot.marked = True
        if self._is_heavy(slot.request_file):
            self.hashtable[new_page_name] = (
                self.hashtable.get(new_page_name, 0) + 1
            )
        else:
            self.cms.add(new_page_name)

    def replace(self, slot_pos, new_page_name, request_file):
        """Evict slot ``slot_pos`` and fill it with the new page."""
        self.evict(slot_pos)
        self.fill_in(slot_pos, new_page_name, request_file)

    def select_unmarked(self, mode):
        """Return the cache position of an unmarked slot to replace.

        ``mode`` is 'random' (uniform choice among unmarked slots) or
        'ml_oracle' (the unmarked slot with the lowest recorded frequency;
        the first such slot wins ties).

        Raises:
            ValueError: if ``mode`` is not one of the two supported values,
                or if no slot is unmarked.
        """
        unmarked = []
        for i in range(self.size):
            if not self.cache[i].marked:
                print(self.cache[i])
                unmarked.append(i)
        print('unmarked:', unmarked)

        if mode == 'random':
            rand_pos = random.randint(0, len(unmarked) - 1)
            print('unmarked len =', len(unmarked))
            print('rand_pos=', rand_pos)
            print('real pos =', unmarked[rand_pos])
            return unmarked[rand_pos]

        if mode == 'ml_oracle':
            # Heavy pages are counted in the hash table, all others in the
            # count-min sketch.
            unmarked_freq = []
            for slot_pos in unmarked:
                slot = self.cache[slot_pos]
                if self._is_heavy(slot.request_file):
                    freq = self.hashtable.get(slot.page_name, 0)
                else:
                    freq = self.cms.check(slot.page_name)
                unmarked_freq.append(freq)
            print(unmarked)
            print(unmarked_freq)
            # Previously this compared slot positions instead of
            # frequencies and returned an index into ``unmarked`` rather
            # than a cache position.
            lowest = min(
                range(len(unmarked)), key=unmarked_freq.__getitem__
            )
            return unmarked[lowest]

        raise ValueError('unknown selection mode: {!r}'.format(mode))

    def request_page(self, page_name, request_file, mode='random'):
        """Serve a page request; return True on a hit and False on a miss."""
        if self.lookup(page_name):
            self.set_marked(page_name)
            if self._is_heavy(request_file):
                if page_name in self.hashtable:
                    self.hashtable[page_name] = self.hashtable[page_name] + 1
                    print('freq:', self.hashtable[page_name])
                else:
                    self.hashtable[page_name] = 1
            else:
                self.cms.add(page_name)
                print('freq:', self.cms.check(page_name))
            self._reset_if_all_marked()
            return True

        # a miss occur
        self.miss_count += 1
        if not self.isAllSlotTaken():
            # Use the first idle slot.
            for i in range(self.size):
                if self.cache[i].page_name == '-':
                    self.replace(i, page_name, request_file)
                    break
        else:
            replace_pos = self.select_unmarked(mode)
            self.replace(replace_pos, page_name, request_file)
        self._reset_if_all_marked()
        return False

    def _is_heavy(self, request_file):
        """Ask the model whether ``request_file`` is a heavy item."""
        return is_heavy(self.model, file_to_dataframe(request_file))

    def _reset_if_all_marked(self):
        """Print the cache and start a new phase once every slot is marked."""
        if self.is_all_marked():
            print('cache is all marked, ready to reset')
            self.print()
            self.reset()
def get_withtime_files(path, filter_threshold):
    """Parse per-user log files into time slots and plot prefetching load.

    Args:
        path: iterable of directory paths; every file in each directory is
            read as a space-separated log.
        filter_threshold: a link becomes a prefetching candidate when the
            integer in field 4 of a record exceeds this value.

    Returns:
        A tuple ``(raw_data_set, number_of_time_slots, prefetching_list,
        user_scale)``. ``raw_data_set`` maps a time-slot key to a list of
        ``[user_id, link, size]`` records, ``prefetching_list`` maps a link
        to the largest size seen for it (kept as the original string), and
        ``user_scale`` is the largest numeric user id plus one (0 when no
        record was read).

    Side effects: writes 'trigger_list_dir.txt' and 'time_list_dir.txt'
    through ``obj_save``, prints summary counts and shows a matplotlib
    figure.
    """
    raw_data_set = collections.defaultdict(list)
    count = 0  # files that could not be parsed completely
    prefetching_list = {}
    ID_stack = []

    for directory in path:
        for filename in os.listdir(directory):
            data_path = os.path.join(directory, filename)
            # The user id is the file stem without its first 3 characters.
            user_id = filename.split('.')[0][3:]
            try:
                with open(data_path) as fp:
                    features = fp.readlines()
                last_same = link_pure_maker(
                    features[0].split(' ')[3].split('/'))
                for num, line in enumerate(features):
                    vec = line.split(' ')
                    if vec[3][-4:-1] in IGNORE_PATTERN:
                        continue
                    link_cat = link_pure_maker(vec[3].split('/'))
                    # Skip consecutive repeats of the same link.
                    if num != 0 and link_cat == last_same:
                        continue
                    last_same = link_cat

                    tarray = time.localtime(int(vec[1]))
                    # NOTE(review): 360 looks like a typo for 3600 (seconds
                    # per hour); left unchanged because it alters the
                    # time-slot keys - confirm.
                    time_str_buff = int(
                        (tarray.tm_hour * 360
                         + tarray.tm_min * 60
                         + tarray.tm_sec) / 10)

                    try:
                        size = int(vec[4])
                    except (ValueError, IndexError):
                        size = None  # record without a usable size field

                    if size is not None and size > 0:
                        raw_data_set[time_str_buff].append(
                            [str(user_id), link_cat, vec[4]])
                    ID_stack.append(int(user_id))

                    if size is not None and size > filter_threshold:
                        # Compare numerically; the stored values are the
                        # raw strings, which used to be compared
                        # lexicographically.
                        known = prefetching_list.get(link_cat)
                        if known is None or size > int(known):
                            prefetching_list[link_cat] = vec[4]
            except Exception:
                # Irregular file: keep what was parsed so far and count it.
                count += 1

    # monitoring
    time_list = []
    for key, records in raw_data_set.items():
        capacity = 0
        cms = CountMinSketch(width=1000, depth=5)
        bloom = BloomFilter(max_elements=10000, error_rate=0.1)
        for record in records:
            bloom.add(record[1])
            cms.add(record[1])
        type_count = 0
        amount = 0
        for link, size in prefetching_list.items():
            if link in bloom:
                capacity += int(size) * 100
                amount += 1
                type_count += cms.check(link)
        rep_eta = 0 if type_count == 0 else amount / type_count
        time_list.append([key, capacity, rep_eta])
    time_list.sort()

    # Load change between consecutive time slots.
    a_list = [
        [cur[0], prev[1] - cur[1], cur[1], cur[2]]
        for prev, cur in zip(time_list, time_list[1:])
    ]

    trigger_list = []
    for slot, delta, load, eta in a_list:
        if abs(delta) > 250000000:
            # NOTE(review): raises ZeroDivisionError when the slot's load
            # is 0 - confirm how such slots should be reported.
            trigger_list.append([slot, delta / load, load, eta])

    obj_save(trigger_list, 'trigger_list_dir.txt')
    obj_save(time_list, 'time_list_dir.txt')
    print('激活点位', len(trigger_list))
    print('总时间点位', len(time_list))

    plt.title(' ')
    plt.xlabel('timestamp')
    plt.xticks(rotation=45)
    plt.ylabel('Loads')
    plt.plot([i[0] for i in time_list], [i[1] for i in time_list],
             '-', color='b', label='Prefetching Loads')
    plt.plot([i[0] for i in a_list], [i[1] for i in a_list],
             '-', color='r', label='fluctuation')
    plt.legend()
    plt.grid()
    plt.show()

    print(len(raw_data_set))
    user_id_array = np.array(ID_stack)
    # np.max raises on an empty array, so report 0 users in that case.
    user_scale = np.max(user_id_array) + 1 if user_id_array.size else 0
    print('irregular_format:', count)
    print('Num of users: ', user_scale)
    return raw_data_set, len(raw_data_set), prefetching_list, user_scale