def test_cms_check_min(self):
    ''' test checking number elements using min algorithm (query type left untouched) '''
    samples = [
        ('this is a test', 255),
        ('this is another test', 189),
        ('this is also a test', 16),
        ('this is something to test', 5),
    ]
    sketch = CountMinSketch(width=1000, depth=5)
    # add() is expected to hand back the count it just stored
    for key, weight in samples:
        self.assertEqual(sketch.add(key, weight), weight)
    # look the keys up in the opposite order from insertion
    for key, weight in reversed(samples):
        self.assertEqual(sketch.check(key), weight)
    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_check_min(self):
    """test checking number elements using min algorithm (query type left untouched)"""
    keys = (
        "this is a test",
        "this is another test",
        "this is also a test",
        "this is something to test",
    )
    weights = (255, 189, 16, 5)
    sketch = CountMinSketch(width=1000, depth=5)
    # every insertion should report the freshly stored count
    for key, weight in zip(keys, weights):
        self.assertEqual(sketch.add(key, weight), weight)
    # verify last-inserted first
    for key, weight in zip(reversed(keys), reversed(weights)):
        self.assertEqual(sketch.check(key), weight)
    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_check_mean_called(self):
    ''' test checking number elements with query_type explicitly set to 'mean' '''
    samples = [
        ('this is a test', 255),
        ('this is another test', 189),
        ('this is also a test', 16),
        ('this is something to test', 5),
    ]
    sketch = CountMinSketch(width=1000, depth=5)
    sketch.query_type = 'mean'
    for key, weight in samples:
        self.assertEqual(sketch.add(key, weight), weight)
    # queries run in reverse insertion order
    for key, weight in reversed(samples):
        self.assertEqual(sketch.check(key), weight)
    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_check_mean_called_even(self):
    """test checking number elements with query_type 'mean-min' when the depth is an even number"""
    samples = [
        ("this is a test", 255),
        ("this is another test", 189),
        ("this is also a test", 16),
        ("this is something to test", 5),
    ]
    sketch = CountMinSketch(width=1000, depth=6)
    sketch.query_type = "mean-min"
    for key, weight in samples:
        self.assertEqual(sketch.add(key, weight), weight)
    # queries run in reverse insertion order
    for key, weight in reversed(samples):
        self.assertEqual(sketch.check(key), weight)
    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_check_mean_called_even(self):
    ''' test checking number elements with query_type 'mean-min' when the depth is an even number '''
    keys = (
        'this is a test',
        'this is another test',
        'this is also a test',
        'this is something to test',
    )
    weights = (255, 189, 16, 5)
    sketch = CountMinSketch(width=1000, depth=6)
    sketch.query_type = 'mean-min'
    for key, weight in zip(keys, weights):
        self.assertEqual(sketch.add(key, weight), weight)
    # verify last-inserted first
    for key, weight in zip(reversed(keys), reversed(weights)):
        self.assertEqual(sketch.check(key), weight)
    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_min_val(self):
    ''' removing more than the counters can hold must stop at the minimums (no underflow) '''
    key = 'this is a test'
    oversized = INT64_T_MAX + 5
    sketch = CountMinSketch(width=1000, depth=5)
    sketch.remove(key, oversized)
    # the per-key estimate is expected at the 32 bit floor ...
    self.assertEqual(sketch.check(key), INT32_T_MIN)
    # ... and the running total at the 64 bit floor
    self.assertEqual(sketch.elements_added, INT64_T_MIN)
def test_cms_max_val(self):
    ''' adding more than the counters can hold must stop at the maximums (no overflow) '''
    key = 'this is a test'
    oversized = INT64_T_MAX + 5
    sketch = CountMinSketch(width=1000, depth=5)
    sketch.add(key, oversized)
    # the per-key estimate is expected at the 32 bit ceiling ...
    self.assertEqual(sketch.check(key), INT32_T_MAX)
    # ... and the running total at the 64 bit ceiling
    self.assertEqual(sketch.elements_added, INT64_T_MAX)
def test_cms_max_val(self):
    """adding more than the counters can hold must stop at the maximums (no overflow)"""
    oversized = INT64_T_MAX + 5
    sketch = CountMinSketch(width=1000, depth=5)
    sketch.add("this is a test", oversized)
    estimate = sketch.check("this is a test")
    # per-key estimate expected at the 32 bit ceiling, running total at the 64 bit one
    self.assertEqual(estimate, INT32_T_MAX)
    self.assertEqual(sketch.elements_added, INT64_T_MAX)
def test_cms_clear(self):
    """clear() must zero both the running total and the stored counts"""
    key = "this is a test"
    sketch = CountMinSketch(width=1000, depth=5)
    inserted = sketch.add(key, 100)
    self.assertEqual(inserted, 100)
    self.assertEqual(sketch.elements_added, 100)

    sketch.clear()

    self.assertEqual(sketch.elements_added, 0)
    self.assertEqual(sketch.check(key), 0)
def test_cms_clear(self):
    ''' clear() must zero both the running total and the stored counts '''
    key, amount = 'this is a test', 100
    sketch = CountMinSketch(width=1000, depth=5)
    self.assertEqual(sketch.add(key, amount), 100)
    self.assertEqual(sketch.elements_added, 100)
    # wipe the sketch and confirm nothing is left behind
    sketch.clear()
    self.assertEqual(sketch.elements_added, 0)
    self.assertEqual(sketch.check(key), 0)
def test_cms_join(self):
    """joining two identically filled sketches should double every count"""
    samples = [
        ("this is a test", 255),
        ("this is another test", 189),
        ("this is also a test", 16),
        ("this is something to test", 5),
    ]
    cms1 = CountMinSketch(width=1000, depth=5)
    cms2 = CountMinSketch(width=1000, depth=5)
    # fill the first sketch completely, then the second, with the same data
    for sketch in (cms1, cms2):
        for key, weight in samples:
            self.assertEqual(weight, sketch.add(key, weight))

    cms1.join(cms2)

    for key, weight in samples:
        self.assertEqual(weight * 2, cms1.check(key))
def test_cms_join_underflow(self):
    """joining a sketch with itself must stop at the minimums instead of underflowing"""
    first_key = "this is a test"
    oversized = INT32_T_MAX + 5
    sketch = CountMinSketch(width=1000, depth=5)

    # remove half of the oversized amount; the assertions below expect the
    # self-join to bring the total to the full (negative) amount
    sketch.remove(first_key, oversized // 2)
    sketch.join(sketch)
    self.assertEqual(INT32_T_MIN, sketch.check(first_key))
    self.assertEqual(sketch.elements_added, -oversized)

    # same idea for the 64 bit running total
    sketch.remove("this is a test 2 ", INT64_T_MAX // 2)
    sketch.join(sketch)
    self.assertEqual(sketch.elements_added, INT64_T_MIN)
class CM4:
    """Frequency estimator built on a depth-4 CountMinSketch plus the set of keys seen."""

    def __init__(self, width=128):
        """Create the sketch; a width below 1 is rejected with RuntimeError."""
        if width < 1:
            raise RuntimeError("bad width for cm4")
        self.keys = set()
        self.cm4 = CountMinSketch(width, 4)

    def add(self, key: str):
        """Count one occurrence of ``key`` and remember the key for reset()."""
        self.cm4.add(key)
        self.keys.add(key)

    def estimate(self, key: str):
        """Return the sketch's current estimate for ``key``."""
        estimate = self.cm4.check(key)
        return estimate

    def reset(self):
        """Age every remembered key by removing roughly half of its estimate."""
        # iterate over a snapshot because keys may be discarded while looping
        for key in list(self.keys):
            halved = (self.cm4.check(key) >> 1) & 9223372036854775807
            # if halved > 1, it will be half of the count
            if halved == 0:
                # nothing left to halve: remove one unit and stop tracking the key
                self.keys.discard(key)
                halved = 1
            self.cm4.remove(key, halved)
def test_cms_load(self):
    """export a sketch to a temporary file, verify the file's md5, then reload it from that file"""
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    tmp_options = {"dir": os.getcwd(), "suffix": ".cms", "delete": DELETE_TEMP_FILES}
    with NamedTemporaryFile(**tmp_options) as fobj:
        path = fobj.name
        original = CountMinSketch(width=1000, depth=5)
        self.assertEqual(original.add("this is a test", 100), 100)
        original.export(path)
        self.assertEqual(calc_file_md5(path), expected_md5)

        # construct a second sketch straight from the exported file
        reloaded = CountMinSketch(filepath=path)
        self.assertEqual(reloaded.elements_added, 100)
        self.assertEqual(reloaded.check("this is a test"), 100)
def test_cms_load(self):
    ''' test loading a count-min sketch from file

    The exported file is removed in a ``finally`` block so a failing
    assertion no longer leaves ``test.cms`` behind in the working directory.
    '''
    md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
    filename = 'test.cms'
    cms = CountMinSketch(width=1000, depth=5)
    self.assertEqual(cms.add('this is a test', 100), 100)
    try:
        cms.export(filename)
        md5_out = calc_file_md5(filename)
        self.assertEqual(md5_out, md5_val)

        # try loading directly to file!
        cms2 = CountMinSketch(filepath=filename)
        self.assertEqual(cms2.elements_added, 100)
        self.assertEqual(cms2.check('this is a test'), 100)
    finally:
        # export() may have failed before creating the file
        if os.path.exists(filename):
            os.remove(filename)
def test_cms_load_diff_hash(self):
    """reload an exported sketch with a different hash function and compare the hashes"""
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    key = "this is a test"
    tmp_options = {"dir": os.getcwd(), "suffix": ".cms", "delete": DELETE_TEMP_FILES}
    with NamedTemporaryFile(**tmp_options) as fobj:
        path = fobj.name
        cms = CountMinSketch(width=1000, depth=5)
        self.assertEqual(cms.add(key, 100), 100)
        cms.export(path)
        self.assertEqual(calc_file_md5(path), expected_md5)

        cms2 = CountMinSketch(filepath=path, hash_function=different_hash)
        self.assertEqual(cms2.elements_added, 100)
        # should not work since it is a different hash
        # NOTE(review): this queries `cms`, not the reloaded `cms2`, and compares
        # against True -- confirm that is the intended assertion
        self.assertNotEqual(cms.check(key), True)
        self.assertNotEqual(cms.hashes(key), cms2.hashes(key))
def test_cms_load_diff_hash(self):
    ''' test loading a count-min sketch from file with a different hash function

    The exported file is removed in a ``finally`` block so a failing
    assertion no longer leaves ``test.cms`` behind in the working directory.
    '''
    md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
    filename = 'test.cms'
    cms = CountMinSketch(width=1000, depth=5)
    self.assertEqual(cms.add('this is a test', 100), 100)
    try:
        cms.export(filename)
        md5_out = calc_file_md5(filename)
        self.assertEqual(md5_out, md5_val)

        cms2 = CountMinSketch(filepath=filename, hash_function=different_hash)
        self.assertEqual(cms2.elements_added, 100)
        # should not work since it is a different hash
        # NOTE(review): this queries `cms`, not the reloaded `cms2`, and compares
        # against True -- confirm that is the intended assertion
        self.assertNotEqual(cms.check('this is a test'), True)
        self.assertNotEqual(cms.hashes('this is a test'),
                            cms2.hashes('this is a test'))
    finally:
        # export() may have failed before creating the file
        if os.path.exists(filename):
            os.remove(filename)
# Time one sketch insertion for the current `stock_symbol` and keep an exact
# per-symbol tally in `stock_symbol_dist` so the sketch can be checked later.
# NOTE(review): `stock_symbol`, `sketch_time`, `total_accuracy` and the other
# names read here are defined outside this excerpt; these first statements
# presumably run once per input record inside an enclosing loop -- confirm the
# indentation against the full script.
sketch_starttime = time.process_time()
add1 = stock_freq_cms.add(stock_symbol)
sketch_endtime = time.process_time()
sketch_time = sketch_time + (sketch_endtime - sketch_starttime)
if stock_symbol in stock_symbol_dist.keys():
    stock_symbol_freq = stock_symbol_dist[stock_symbol] + 1
else:
    stock_symbol_freq = 1
stock_symbol_dist.update({stock_symbol: stock_symbol_freq})
# For every symbol seen: time the sketch query, compare it with the exact
# count and write one CSV row with the comparison.
for stock_symbol in list(stock_symbol_dist.keys()):
    sketch_starttime = time.process_time()
    stock_symbol_freq_cms = stock_freq_cms.check(stock_symbol)
    sketch_endtime = time.process_time()
    sketch_qrytime = sketch_qrytime + (sketch_endtime - sketch_starttime)
    stock_symbol_freq = stock_symbol_dist[stock_symbol]
    # accuracy = 1 - relative error, floored at 0 when the estimate is off by
    # more than the exact count itself
    accuracy = 1 - abs(stock_symbol_freq-stock_symbol_freq_cms)/stock_symbol_freq
    if accuracy < 0:
        accuracy = 0
    total_accuracy = total_accuracy + accuracy
    # row layout: symbol, record count, width, depth, exact, estimate, accuracy
    stock_symbol_file.write(stock_symbol + "," + str(no_of_record))
    stock_symbol_file.write("," + str(width) + "," + str(depth))
    stock_symbol_file.write("," + str(stock_symbol_freq) + "," + str(stock_symbol_freq_cms))
    stock_symbol_file.write("," + str(accuracy) + "\n")
symbol_count = len(stock_symbol_dist)
# NOTE(review): raises ZeroDivisionError when no symbol was recorded
avg_accuracy = total_accuracy / symbol_count
stock_freq_cms.export(cms_filename)
dfStrPrWin = dfStrPr[start: end] # print("start:", start) # print("end:", end) # print(dfStrPrWin) for m , row in dfStrPrWin.iterrows(): mov = dfStrPrWin.loc[m, "movie"] #BQ1_1 movie frq counter if mov in movsFrq.index: movsFrq.loc[mov]['Frq'] = movsFrq.loc[mov]['Frq'] + 1 else: movsFrq.loc[mov] = [1] #BQ1_2 min-sketch if BQ1_2 == 1: mov_s = str(mov) cms.add(mov_s) movsFrqSkt.loc[mov] = cms.check(mov_s) start = start + strmStep end = end + 1000 n =n + strmStep movsFrq.to_csv(storeResults + "movie-counter.csv") #BQ1_3 Compare method accuracy if BQ1_3 == 1: movsFrq = movsFrq.sort_values('Frq', ascending=False) movsFrqSkt = movsFrqSkt.sort_values('Frq', ascending=False) rmse = mean_squared_error(movsFrq, movsFrqSkt) print("BQ1_3 Compare method accuracy, RMSE: ", rmse)
# Module-level sketch shared by main() and the reporting code below.
cms = CountMinSketch(width=1000, depth=4)


def main():
    """Feed 20 random integers from 0..10 (as strings) into the sketch, one per second."""
    # bounded so the frequency report after main() can run
    for _ in range(20):
        msg = str(randint(0, 10))
        cms.add(msg)
        print(msg)
        time.sleep(1)


if __name__ == "__main__":
    main()
    # report the estimated frequency of a few sample values
    print("frequency of 9 = ", cms.check('9'))
    print("frequency of 7 = ", cms.check('7'))
    print("frequency of 2 = ", cms.check('2'))
    # persist the sketch to a local file
    cms.export('cms_synopsis')
# Stream every movie id of the ratings file into a count-min sketch and report
# timing plus the (shallow) size of the sketch object.
with open('data-streaming-project.data') as f:
    rows_number = sum(1 for line in f)
data_df = pd.read_csv(
    'data-streaming-project.data',
    encoding='utf-8',
    delimiter='\t',
    names=['user', 'movie', 'rating', 'timestamp'],
    header=None,
)
df_movies = data_df['movie']
cms = CountMinSketch(width=200, depth=21)
start_time2 = time.time()
counter = 0
# Iterate over the column itself instead of a hard-coded row count, so the
# script neither fails on a shorter file nor silently truncates a longer one.
for counter, movie_id in enumerate(df_movies, start=1):
    cms.add(str(movie_id))
    if counter % 1000 == 0:
        # sys.getsizeof is shallow: it does not follow references held by cms
        print(sys.getsizeof(cms))
end_time2 = time.time()
cms.check('592')
print(sys.getsizeof(cms))
print("Total execution time: {}".format(end_time2 - start_time2))
class makerCache:
    """Fixed-size page cache that evicts unmarked slots and works in phases.

    Every slot is a node with a ``page_name`` (``'-'`` means the slot is
    empty), a ``request_file`` and a ``marked`` flag.  A request marks the
    page it touches; once every slot is marked, all marks are cleared and a
    new phase starts.  Request frequencies are tracked exactly in
    ``hashtable`` for pages the model classifies as heavy and approximately
    in a count-min sketch for all others.
    """

    def __init__(self, size):
        """Create ``size`` empty slots, the model and the frequency trackers."""
        self.phase = 1
        self.round = 1
        self.size = size
        self.clean_counter = 0
        self.clean_set = []
        # init empty cache(list); '-' is the sentinel name of an empty slot
        self.cache = [
            Node('-', RequestFile(0, 'txt', False)) for _ in range(size)
        ]
        self.model = train_model()
        self.cms = CountMinSketch(width=1000, depth=5)
        self.hashtable = {}  # single bucket for heavy items
        self.miss_count = 0

    def print(self):
        """Print the page names of all slots on one line, then two spacer lines."""
        for node in self.cache[:self.size]:
            print(node.page_name, end=' ')
        print(' ')
        print(' ')

    def get_phase(self):
        """Return the current phase number."""
        return self.phase

    def isAllSlotTaken(self):
        """Return True when no slot carries the empty sentinel ``'-'``."""
        return all(node.page_name != '-' for node in self.cache[:self.size])

    def lookup(self, page_name):
        """Return True when ``page_name`` is currently cached."""
        return any(
            node.page_name == page_name for node in self.cache[:self.size]
        )

    def set_marked(self, page_name):
        """Mark every slot holding ``page_name``.

        The original assigned to a misspelt attribute (``makred``), so a hit
        never actually marked the page.
        """
        for node in self.cache[:self.size]:
            if node.page_name == page_name:
                node.marked = True

    def is_all_marked(self):
        """Return True when every slot is marked."""
        return all(node.marked for node in self.cache[:self.size])

    def reset(self):
        """Clear all marks and start the next phase."""
        for node in self.cache[:self.size]:
            node.marked = False
        self.phase = self.phase + 1
        self.clean_counter = 0
        # save current cache as the set of elements that are possibly stale in new phase

    def evict(self, slot_pos):
        """Return slot ``slot_pos`` to the empty, unmarked state."""
        node = self.cache[slot_pos]
        node.page_name = '-'
        node.request_file.update_year = 0
        node.request_file.file_type = '-'
        # attribute name aligned with the one read from incoming request files
        node.request_file.is_in_homepage = False
        node.marked = False

    def fill_in(self, slot_pos, new_page_name, request_file):
        """Store the page in slot ``slot_pos``, mark it and count the request."""
        node = self.cache[slot_pos]
        node.page_name = new_page_name
        node.request_file.update_year = request_file.update_year
        node.request_file.file_type = request_file.file_type
        # the original wrote to the misspelt ``is_in_hompage``, leaving the
        # attribute that is read elsewhere at its stale value
        node.request_file.is_in_homepage = request_file.is_in_homepage
        node.marked = True
        self._record_request(new_page_name, node.request_file)

    def replace(self, slot_pos, new_page_name, request_file):
        """Evict slot ``slot_pos`` and fill it with the new page."""
        self.evict(slot_pos)
        self.fill_in(slot_pos, new_page_name, request_file)

    def select_unmarked(self, mode):
        """Return the slot position of the unmarked page to evict.

        ``mode='random'`` picks uniformly among the unmarked slots;
        ``mode='ml_oracle'`` picks the unmarked slot whose page has the
        lowest recorded frequency (hash table for heavy pages, count-min
        sketch otherwise).

        Raises:
            ValueError: if no slot is unmarked or ``mode`` is unknown.
        """
        unmarked = []
        for i in range(0, self.size):
            if not self.cache[i].marked:
                print(self.cache[i])
                unmarked.append(i)
        print('unmarked:', unmarked)
        if not unmarked:
            raise ValueError('no unmarked slot available')

        # rand method
        if mode == 'random':
            rand_pos = random.randint(0, len(unmarked) - 1)
            print('unmarked len =', len(unmarked))
            print('rand_pos=', rand_pos)
            print('real pos =', unmarked[rand_pos])
            return unmarked[rand_pos]

        if mode == 'ml_oracle':
            # predict all unmarked elements (predicted value = frequency) and
            # select the lowest-frequency one
            unmarked_freq = []
            for slot in unmarked:
                node = self.cache[slot]
                file_df = file_to_dataframe(node.request_file)
                if is_heavy(self.model, file_df):
                    # heavy: exact count from the hash table
                    freq = self.hashtable.get(node.page_name, 0)
                else:
                    # not heavy: estimate from the count-min sketch
                    freq = self.cms.check(node.page_name)
                unmarked_freq.append(freq)
            print(unmarked)
            print(unmarked_freq)
            # The original compared slot positions instead of frequencies and
            # returned an index into ``unmarked`` rather than a slot position.
            lowest = min(range(len(unmarked)), key=unmarked_freq.__getitem__)
            return unmarked[lowest]

        raise ValueError('unknown selection mode: {!r}'.format(mode))

    def request_page(self, page_name, request_file, mode='random'):
        """Serve one request; return True on a cache hit and False on a miss.

        A hit marks the page and counts the request.  A miss increments
        ``miss_count`` and places the page in the first empty slot or, when
        the cache is full, in the slot chosen by ``select_unmarked(mode)``.
        After either outcome the cache starts a new phase if every slot is
        marked.
        """
        # requested page in cache
        if self.lookup(page_name):
            self.set_marked(page_name)
            self._record_request(page_name, request_file, verbose=True)
            self._reset_if_all_marked()
            return True

        # a miss occurs
        self.miss_count += 1

        # page not in cache, idle slot available
        if not self.isAllSlotTaken():
            for i in range(0, self.size):
                if self.cache[i].page_name == '-':
                    self.replace(i, page_name, request_file)
                    break
            self._reset_if_all_marked()
            return False

        # page not in cache and no idle slot
        replace_pos = self.select_unmarked(mode)
        self.replace(replace_pos, page_name, request_file)
        self._reset_if_all_marked()
        return False

    def _record_request(self, page_name, request_file, verbose=False):
        """Count one request for ``page_name`` in the tracker its class uses."""
        file_df = file_to_dataframe(request_file)
        if is_heavy(self.model, file_df):
            seen_before = page_name in self.hashtable
            self.hashtable[page_name] = self.hashtable.get(page_name, 0) + 1
            # as before, the frequency is only echoed for already-known pages
            if verbose and seen_before:
                print('freq:', self.hashtable[page_name])
        else:
            self.cms.add(page_name)
            if verbose:
                print('freq:', self.cms.check(page_name))

    def _reset_if_all_marked(self):
        """Start a new phase when every slot is marked."""
        if self.is_all_marked():
            print('cache is all marked, ready to reset')
            self.print()
            self.reset()
def get_withtime_files(path, filter_threshold):
    """Parse trace files into time buckets, derive per-bucket load figures and plot them.

    NOTE(review): SOURCE for this function had lost its indentation; the
    nesting below (in particular which statements sit inside the
    ``if num == 0 or link_cat != last_same`` branch) is a reconstruction and
    should be confirmed against the original file.

    Args:
        path: iterable of directory names; each is concatenated directly with
            a file name, so it must already end with a path separator.
        filter_threshold: a link enters ``prefetching_list`` only when
            ``int(vec[4])`` is greater than this value.

    Returns:
        ``(raw_data_set, len(raw_data_set), prefetching_list, user_scale)``
        where ``raw_data_set`` maps a time bucket to a list of
        ``[user_id_str, link_cat, vec[4]]`` entries and ``user_scale`` is the
        largest collected user id plus one.

    Side effects: writes 'trigger_list_dir.txt' and 'time_list_dir.txt' via
    ``obj_save``, prints summary counts and shows a matplotlib figure.
    """
    raw_data_set = collections.defaultdict(list)
    count = 0  # number of files whose processing raised any exception
    prefetching_list ={}
    ID_stack =[]
    for dir in path:
        dir_list = os.listdir(dir)
        time_stack =[]  # never read in this function
        for filename in dir_list:
            data_path = dir+filename
            try:
                # NOTE(review): the file handle is never closed
                fp = open(data_path)
                features = fp.readlines()
                time_slide = []  # never read in this function
                last_same = link_pure_maker(features[0].split(' ')[3].split('/'))
                for num in range(len(features)):
                    vec = features[num].split(' ')
                    #delete the same pattern
                    # the three characters before the last one of field 3 are
                    # matched against IGNORE_PATTERN
                    if vec[3][-4:-1] in IGNORE_PATTERN:
                        pass
                    else:
                        buff_link_list = vec[3].split('/')
                        link_cat = link_pure_maker(buff_link_list)
                        # only the first line and lines whose link differs from
                        # the previously kept one are processed
                        if num == 0 or link_cat != last_same:
                            last_same = link_cat
                            tarray = time.localtime(int(vec[1]))
                            # time_str_buff= str(tarray.tm_mon)+str(tarray.tm_mday)+str(tarray.tm_hour)+str(tarray.tm_min)+str(tarray.tm_sec)
                            # time_str_buff= str(tarray.tm_hour)+' '+str(tarray.tm_min)
                            # NOTE(review): the hour is weighted by 360, not
                            # 3600, so different hours can share a bucket --
                            # confirm this is intended
                            time_str_buff=int((tarray.tm_hour*360+tarray.tm_min*60+tarray.tm_sec)/10)
                            # time_str_buff=int((tarray.tm_hour*60+tarray.tm_min)/8)
                            try:
                                if int(vec[4])>0:
                                    raw_data_set[time_str_buff].append([str(filename.split('.')[0][3:]),link_cat,vec[4]])
                            except:
                                # a non-numeric vec[4] is silently skipped
                                pass
                            # user id = file name stem without its first three characters
                            ID_stack.append(int(str(filename.split('.')[0][3:])))
                            #[link, user_ID, time_hour]
                            try:
                                if int(vec[4]) > filter_threshold:
                                    if link_cat in prefetching_list:
                                        # NOTE(review): both operands are
                                        # strings here, so this comparison is
                                        # lexicographic, not numeric
                                        if vec[4]>prefetching_list[link_cat]:
                                            prefetching_list[link_cat]=vec[4]
                                    else:
                                        prefetching_list[link_cat]=vec[4]
                            except:
                                pass
                        else:
                            pass
            except:
                # any error while reading or parsing this file counts it as irregular
                count+=1
    #monitoring
    time_list = []
    for key, cont in raw_data_set.items():
        capacity = 0
        # fresh sketch and bloom filter for every time bucket
        cms = CountMinSketch(width=1000, depth=5)
        bloom = BloomFilter(max_elements=10000, error_rate=0.1)
        for i in raw_data_set[key]:
            bloom.add(i[1])
            cms.add(i[1])
        type_count=0
        amount =0
        for i,l in prefetching_list.items():
            if i in bloom:
                capacity+=int(l)*100
                amount+=1
                type_count+= cms.check(i)
        # rep_eta = matched links / summed sketch counts (0 when nothing matched)
        if type_count==0:
            rep_eta = 0
        else:
            rep_eta = amount/type_count
        # print(str(rep_eta)[:5])
        time_list.append([key,capacity,rep_eta])
    time_list.sort()
    a_list =[]
    # per bucket: [bucket, previous capacity - capacity, capacity, rep_eta]
    for i in range(1,len(time_list)):
        a_list.append([time_list[i][0],time_list[i-1][1]-time_list[i][1],time_list[i][1],time_list[i][2]])
    trigger_list=[]
    for i in a_list:
        if abs(i[1])>250000000:
            # NOTE(review): i[2] is the bucket's capacity; a capacity of 0
            # would raise ZeroDivisionError here
            trigger_list.append([i[0],i[1]/i[2],i[2],i[3]])
    obj_save(trigger_list, 'trigger_list_dir.txt')
    obj_save(time_list, 'time_list_dir.txt')
    print('激活点位', len(trigger_list))
    print('总时间点位', len(time_list))
    plt.title(' ')
    plt.xlabel('timestamp')
    plt.xticks(rotation=45)
    plt.ylabel('Loads')
    plt.plot([i[0] for i in time_list], [i[1] for i in time_list],'-',color='b',label='Prefetching Loads')
    plt.plot([i[0] for i in a_list], [i[1] for i in a_list],'-',color='r',label='fluctuation')
    plt.legend()
    plt.grid()
    plt.show()
    print(len(raw_data_set))
    user_id = np.array(ID_stack)
    # NOTE(review): presumably fails when no user id was collected (empty array) -- confirm
    user_scale =np.max(user_id)+1
    print('irregular_format:',count)
    print('Num of users: ',user_scale)
    return raw_data_set, len(raw_data_set), prefetching_list, user_scale