def test_cms_bytes(self):
    """Serialise a count-min sketch with ``bytes()`` and compare its md5 digest."""
    sketch = CountMinSketch(width=1000, depth=5)
    sketch.add("this is a test", 100)
    actual_digest = hashlib.md5(bytes(sketch)).hexdigest()
    expected_digest = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    self.assertEqual(actual_digest, expected_digest)
    def test_cms_join_mixed_types(self):
        """Join count-min, count-mean and count-mean-min sketches into each other."""
        first = "this is a test"
        second = "this is another test"
        third = "this is yet another test"

        min_sketch = CountMinSketch(width=1000, depth=5)
        mean_sketch = CountMeanSketch(width=1000, depth=5)
        mean_min_sketch = CountMeanMinSketch(width=1000, depth=5)

        # each sketch starts out knowing exactly one of the three keys
        min_sketch.add(first, 500)
        mean_sketch.add(second, 500)
        mean_min_sketch.add(third, 500)

        # count-min absorbs count-mean: knows first and second only
        min_sketch.join(mean_sketch)
        self.assertTrue(first in min_sketch)
        self.assertTrue(second in min_sketch)
        self.assertFalse(third in min_sketch)

        # count-mean absorbs count-mean-min: knows second and third only
        mean_sketch.join(mean_min_sketch)
        self.assertFalse(first in mean_sketch)
        self.assertTrue(second in mean_sketch)
        self.assertTrue(third in mean_sketch)
        self.assertFalse("foobar" in mean_sketch)

        # count-mean-min absorbs the already-joined count-min: knows all three
        mean_min_sketch.join(min_sketch)
        self.assertTrue(first in mean_min_sketch)
        self.assertTrue(second in mean_min_sketch)
        self.assertTrue(third in mean_min_sketch)
        self.assertFalse("this is yet another test!" in mean_min_sketch)
 def test_cms_max_val(self):
     """Check that counters are capped instead of overflowing.

     More than INT64_T_MAX occurrences are added at once; the query result
     is expected to be INT32_T_MAX and the running total INT64_T_MAX.
     """
     sketch = CountMinSketch(width=1000, depth=5)
     beyond_int64 = INT64_T_MAX + 5
     sketch.add('this is a test', beyond_int64)
     self.assertEqual(sketch.check('this is a test'), INT32_T_MAX)
     self.assertEqual(sketch.elements_added, INT64_T_MAX)
 def test_cms_add_single(self):
     """Insert the same key four times, one occurrence per call."""
     sketch = CountMinSketch(width=1000, depth=5)
     # add() reports the updated count for the key after every insertion
     for expected_count in (1, 2, 3, 4):
         self.assertEqual(sketch.add('this is a test'), expected_count)
     self.assertEqual(sketch.elements_added, 4)
 def test_cms_add_mult(self):
     """Insert the same key four times, four occurrences per call."""
     sketch = CountMinSketch(width=1000, depth=5)
     # add() reports the updated count for the key after every insertion
     for running_total in (4, 8, 12, 16):
         self.assertEqual(sketch.add('this is a test', 4), running_total)
     self.assertEqual(sketch.elements_added, 16)
 def test_cms_add_mult(self):
     """Add a key in batches of four and verify the reported running count."""
     batch_size = 4
     sketch = CountMinSketch(width=1000, depth=5)
     for batch_number in range(1, 5):
         reported = sketch.add("this is a test", batch_size)
         self.assertEqual(reported, batch_number * batch_size)
     self.assertEqual(sketch.elements_added, 16)
 def test_cms_add_single(self):
     """Add a key one occurrence at a time and verify the reported count."""
     sketch = CountMinSketch(width=1000, depth=5)
     for insertion_number in range(1, 5):
         reported = sketch.add("this is a test")
         self.assertEqual(reported, insertion_number)
     self.assertEqual(sketch.elements_added, 4)
 def test_cms_max_val(self):
     """Verify saturation at the integer limits (no overflow).

     After adding INT64_T_MAX + 5 occurrences of one key, the query result
     must equal INT32_T_MAX and ``elements_added`` must equal INT64_T_MAX.
     """
     key = "this is a test"
     sketch = CountMinSketch(width=1000, depth=5)
     sketch.add(key, INT64_T_MAX + 5)
     self.assertEqual(sketch.check(key), INT32_T_MAX)
     self.assertEqual(sketch.elements_added, INT64_T_MAX)
 def test_cms_export(self):
     """Export a sketch into a temporary file and compare the file's md5 digest."""
     expected_digest = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
     sketch = CountMinSketch(width=1000, depth=5)
     sketch.add("this is a test", 100)
     with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp_file:
         sketch.export(tmp_file.name)
         # hash while the temporary file is still open, i.e. before it may be deleted
         actual_digest = calc_file_md5(tmp_file.name)
     self.assertEqual(actual_digest, expected_digest)
    def test_cms_export(self):
        """Export a count-min sketch to disk and verify the file's md5 digest.

        The scratch file ``test.cms`` is removed in a ``finally`` clause so
        that a failure while exporting or hashing does not leave it behind
        in the working directory.
        """
        md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
        filename = 'test.cms'
        cms = CountMinSketch(width=1000, depth=5)
        cms.add('this is a test', 100)
        try:
            cms.export(filename)
            md5_out = calc_file_md5(filename)
        finally:
            # export may have failed before creating the file
            if os.path.exists(filename):
                os.remove(filename)

        self.assertEqual(md5_out, md5_val)
    def test_cms_frombytes(self):
        """Round-trip a count-min sketch through ``bytes()`` and ``frombytes``."""
        key = "this is a test"
        original = CountMinSketch(width=1000, depth=5)
        original.add(key, 100)

        restored = CountMinSketch.frombytes(bytes(original))

        # the restored sketch serialises identically and keeps its dimensions
        self.assertEqual(bytes(restored), bytes(original))
        self.assertEqual(restored.width, 1000)
        self.assertEqual(restored.depth, 5)
        self.assertEqual(restored.check(key), 100)
Beispiel #12
0
    def test_cms_export(self):
        """Export a count-min sketch to disk and verify the file's md5 digest.

        Cleanup of the scratch file ``test.cms`` happens in a ``finally``
        clause, so an exception raised by the export or by the hashing step
        cannot leave the file in the working directory.
        """
        md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
        filename = 'test.cms'
        cms = CountMinSketch(width=1000, depth=5)
        cms.add('this is a test', 100)
        try:
            cms.export(filename)
            md5_out = calc_file_md5(filename)
        finally:
            # the file only exists if export got far enough to create it
            if os.path.exists(filename):
                os.remove(filename)

        self.assertEqual(md5_out, md5_val)
    def test_cms_join_overflow(self):
        """Join a sketch with itself until the 32-bit and 64-bit limits are hit."""
        key = "this is a test"
        beyond_int32 = INT32_T_MAX + 5
        sketch = CountMinSketch(width=1000, depth=5)

        # self-join after adding half of beyond_int32: the query is expected
        # to report INT32_T_MAX
        sketch.add(key, beyond_int32 // 2)
        sketch.join(sketch)
        self.assertEqual(INT32_T_MAX, sketch.check(key))
        self.assertEqual(sketch.elements_added, beyond_int32)

        # a second self-join is expected to leave the total at INT64_T_MAX
        sketch.add("this is a test 2 ", INT64_T_MAX // 2)
        sketch.join(sketch)
        self.assertEqual(sketch.elements_added, INT64_T_MAX)
    def test_cms_check_min(self):
        """Query element counts with the default (min) strategy."""
        insertions = (
            ("this is a test", 255),
            ("this is another test", 189),
            ("this is also a test", 16),
            ("this is something to test", 5),
        )
        sketch = CountMinSketch(width=1000, depth=5)
        for key, amount in insertions:
            self.assertEqual(sketch.add(key, amount), amount)

        # query in the opposite order of insertion
        for key, amount in reversed(insertions):
            self.assertEqual(sketch.check(key), amount)
        self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
    def test_cms_check_min(self):
        """Insert four keys and read each count back with the min strategy."""
        sketch = CountMinSketch(width=1000, depth=5)
        expected_counts = {
            'this is a test': 255,
            'this is another test': 189,
            'this is also a test': 16,
            'this is something to test': 5,
        }
        for key, amount in expected_counts.items():
            self.assertEqual(sketch.add(key, amount), amount)

        # read back starting with the key that was inserted last
        for key in reversed(list(expected_counts)):
            self.assertEqual(sketch.check(key), expected_counts[key])
        self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
Beispiel #16
0
    def test_cms_check_mean_called(self):
        """Query element counts with ``query_type`` explicitly set to 'mean'."""
        insertions = (
            ('this is a test', 255),
            ('this is another test', 189),
            ('this is also a test', 16),
            ('this is something to test', 5),
        )
        sketch = CountMinSketch(width=1000, depth=5)
        sketch.query_type = 'mean'
        for key, amount in insertions:
            self.assertEqual(sketch.add(key, amount), amount)

        # query in the opposite order of insertion
        for key, amount in reversed(insertions):
            self.assertEqual(sketch.check(key), amount)
        self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
    def test_cms_check_mean_called_even(self):
        """Query element counts with ``query_type`` 'mean-min' on an even depth (6)."""
        insertions = (
            ('this is a test', 255),
            ('this is another test', 189),
            ('this is also a test', 16),
            ('this is something to test', 5),
        )
        sketch = CountMinSketch(width=1000, depth=6)
        sketch.query_type = 'mean-min'
        for key, amount in insertions:
            self.assertEqual(sketch.add(key, amount), amount)

        # query in the opposite order of insertion
        for key, amount in reversed(insertions):
            self.assertEqual(sketch.check(key), amount)
        self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
    def test_cms_check_mean_called_even(self):
        """Use the 'mean-min' query type on a sketch whose depth is even."""
        sketch = CountMinSketch(width=1000, depth=6)
        sketch.query_type = "mean-min"
        expected_counts = {
            "this is a test": 255,
            "this is another test": 189,
            "this is also a test": 16,
            "this is something to test": 5,
        }
        for key, amount in expected_counts.items():
            self.assertEqual(sketch.add(key, amount), amount)

        # read back starting with the key that was inserted last
        for key in reversed(list(expected_counts)):
            self.assertEqual(sketch.check(key), expected_counts[key])
        self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
 def test_cms_remove_mult(self):
     """Remove several occurrences of a key with a single call."""
     key = "this is a test"
     sketch = CountMinSketch(width=1000, depth=5)
     self.assertEqual(sketch.add(key, 16), 16)
     self.assertEqual(sketch.elements_added, 16)
     # remove() reports the count that remains for the key
     remaining = sketch.remove(key, 4)
     self.assertEqual(remaining, 12)
     self.assertEqual(sketch.elements_added, 12)
 def test_cms_remove_mult(self):
     """Add sixteen occurrences of a key, then take four away in one call."""
     sketch = CountMinSketch(width=1000, depth=5)
     added = sketch.add('this is a test', 16)
     self.assertEqual(added, 16)
     self.assertEqual(sketch.elements_added, 16)
     left_over = sketch.remove('this is a test', 4)
     self.assertEqual(left_over, 12)
     self.assertEqual(sketch.elements_added, 12)
 def test_cms_remove_single(self):
     """Remove a key one occurrence at a time."""
     key = 'this is a test'
     sketch = CountMinSketch(width=1000, depth=5)
     self.assertEqual(sketch.add(key, 4), 4)
     self.assertEqual(sketch.elements_added, 4)
     # each remove() call without an amount takes away a single occurrence
     for remaining in (3, 2):
         self.assertEqual(sketch.remove(key), remaining)
     self.assertEqual(sketch.elements_added, 2)
Beispiel #22
0
 def test_cms_remove_single(self):
     """Add four occurrences of a key, then remove two of them one by one."""
     sketch = CountMinSketch(width=1000, depth=5)
     added = sketch.add('this is a test', 4)
     self.assertEqual(added, 4)
     self.assertEqual(sketch.elements_added, 4)
     after_first = sketch.remove('this is a test')
     self.assertEqual(after_first, 3)
     after_second = sketch.remove('this is a test')
     self.assertEqual(after_second, 2)
     self.assertEqual(sketch.elements_added, 2)
 def test_cms_remove_single(self):
     """Check the count reported by each single-occurrence removal."""
     key = "this is a test"
     sketch = CountMinSketch(width=1000, depth=5)
     self.assertEqual(sketch.add(key, 4), 4)
     self.assertEqual(sketch.elements_added, 4)
     # two removals: 4 -> 3 -> 2
     for expected_remaining in range(3, 1, -1):
         self.assertEqual(sketch.remove(key), expected_remaining)
     self.assertEqual(sketch.elements_added, 2)
Beispiel #24
0
    def test_cms_clear(self):
        """Clearing a sketch resets both the total and the per-key count."""
        key = 'this is a test'
        sketch = CountMinSketch(width=1000, depth=5)
        self.assertEqual(sketch.add(key, 100), 100)
        self.assertEqual(sketch.elements_added, 100)

        sketch.clear()

        self.assertEqual(sketch.elements_added, 0)
        self.assertEqual(sketch.check(key), 0)
    def test_cms_clear(self):
        """Populate a sketch, clear it and verify that it reports zero again."""
        sketch = CountMinSketch(width=1000, depth=5)
        reported = sketch.add("this is a test", 100)
        self.assertEqual(reported, 100)
        self.assertEqual(sketch.elements_added, 100)

        sketch.clear()
        # neither the running total nor the key's count may survive clear()
        self.assertEqual(sketch.elements_added, 0)
        self.assertEqual(sketch.check("this is a test"), 0)
    def test_cms_clear(self):
        """Exercise ``clear()``: counts go from 100 back to 0."""
        key, amount = 'this is a test', 100
        sketch = CountMinSketch(width=1000, depth=5)
        self.assertEqual(sketch.add(key, amount), 100)
        self.assertEqual(sketch.elements_added, 100)

        sketch.clear()
        self.assertEqual(sketch.elements_added, 0)
        self.assertEqual(sketch.check(key), 0)
Beispiel #27
0
 def test_cms_str(self):
     """Compare ``str(sketch)`` against the expected multi-line summary."""
     sketch = CountMinSketch(width=1000, depth=5)
     self.assertEqual(sketch.add('this is a test', 100), 100)
     # one header line followed by tab-indented detail lines, no trailing newline
     expected_lines = [
         'Count-Min Sketch:',
         '\tWidth: 1000',
         '\tDepth: 5',
         '\tConfidence: 0.96875',
         '\tError Rate: 0.002',
         '\tElements Added: 100',
     ]
     self.assertEqual(str(sketch), '\n'.join(expected_lines))
 def test_cms_str(self):
     """Verify the human-readable representation of a count-min sketch."""
     sketch = CountMinSketch(width=1000, depth=5)
     reported = sketch.add('this is a test', 100)
     self.assertEqual(reported, 100)
     expected = 'Count-Min Sketch:\n'
     expected += '\tWidth: 1000\n'
     expected += '\tDepth: 5\n'
     expected += '\tConfidence: 0.96875\n'
     expected += '\tError Rate: 0.002\n'
     expected += '\tElements Added: 100'
     self.assertEqual(str(sketch), expected)
Beispiel #29
0
class CM4:
    """Frequency estimator built on a depth-4 count-min sketch.

    Besides the sketch, the set of keys that were added is remembered so
    that ``reset`` can revisit each of them and reduce its count.
    """

    def __init__(self, width=128):
        """Create the sketch; raise RuntimeError when ``width`` is below 1."""
        if width < 1:
            raise RuntimeError("bad width for cm4")
        self.cm4 = CountMinSketch(width, 4)
        self.keys = set()

    def add(self, key: str):
        """Count one occurrence of ``key`` and remember the key."""
        self.cm4.add(key)
        self.keys.add(key)

    def estimate(self, key: str):
        """Return the sketch's current estimate for ``key``."""
        return self.cm4.check(key)

    def reset(self):
        """Reduce the count of every remembered key by half of its estimate.

        When half of the estimate rounds down to zero, a single occurrence
        is removed instead and the key is dropped from the remembered set.
        """
        # iterate over a snapshot because keys may be discarded on the way
        for tracked_key in list(self.keys):
            halved = (self.cm4.check(tracked_key) >> 1) & 0x7FFFFFFFFFFFFFFF
            if halved == 0:
                self.keys.discard(tracked_key)
                halved = 1
            self.cm4.remove(tracked_key, halved)
    def test_cms_load(self):
        """Export a sketch to a temporary file and load it back from that file."""
        expected_digest = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
        key = "this is a test"
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp_file:
            written = CountMinSketch(width=1000, depth=5)
            self.assertEqual(written.add(key, 100), 100)
            written.export(tmp_file.name)
            self.assertEqual(calc_file_md5(tmp_file.name), expected_digest)

            # build a second sketch straight from the exported file
            loaded = CountMinSketch(filepath=tmp_file.name)
            self.assertEqual(loaded.elements_added, 100)
            self.assertEqual(loaded.check(key), 100)
Beispiel #31
0
    def test_cms_load(self):
        """Export a count-min sketch to disk and load it back from the file.

        The scratch file ``test.cms`` is removed in a ``finally`` clause so a
        failing assertion or I/O error does not leave it in the working
        directory.
        """
        md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
        filename = 'test.cms'
        cms = CountMinSketch(width=1000, depth=5)
        self.assertEqual(cms.add('this is a test', 100), 100)
        try:
            cms.export(filename)
            md5_out = calc_file_md5(filename)
            self.assertEqual(md5_out, md5_val)

            # try loading directly to file!
            cms2 = CountMinSketch(filepath=filename)
            self.assertEqual(cms2.elements_added, 100)
            self.assertEqual(cms2.check('this is a test'), 100)
        finally:
            # export may have failed before creating the file
            if os.path.exists(filename):
                os.remove(filename)
    def test_cms_load(self):
        """Write a count-min sketch to ``test.cms`` and read it back.

        Everything that touches the scratch file runs inside ``try`` so that
        the ``finally`` clause deletes the file even when an assertion fails.
        """
        md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
        filename = 'test.cms'
        cms = CountMinSketch(width=1000, depth=5)
        self.assertEqual(cms.add('this is a test', 100), 100)
        try:
            cms.export(filename)
            md5_out = calc_file_md5(filename)
            self.assertEqual(md5_out, md5_val)

            # try loading directly to file!
            cms2 = CountMinSketch(filepath=filename)
            self.assertEqual(cms2.elements_added, 100)
            self.assertEqual(cms2.check('this is a test'), 100)
        finally:
            # the file only exists if export got far enough to create it
            if os.path.exists(filename):
                os.remove(filename)
    def test_cms_load_diff_hash(self):
        """Load an exported sketch while supplying a different hash function."""
        expected_digest = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
        key = "this is a test"
        with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp_file:
            written = CountMinSketch(width=1000, depth=5)
            self.assertEqual(written.add(key, 100), 100)
            written.export(tmp_file.name)
            self.assertEqual(calc_file_md5(tmp_file.name), expected_digest)

            reloaded = CountMinSketch(filepath=tmp_file.name, hash_function=different_hash)
            self.assertEqual(reloaded.elements_added, 100)
            # should not work since it is a different hash
            # NOTE(review): this assertion queries the sketch that was written,
            # not the reloaded one, and compares the count with True -- confirm
            # that this is what the test is meant to check.
            self.assertNotEqual(written.check(key), True)
            self.assertNotEqual(written.hashes(key), reloaded.hashes(key))
Beispiel #34
0
    def test_cms_load_diff_hash(self):
        """Load an exported sketch while supplying a different hash function.

        The scratch file ``test.cms`` is removed in a ``finally`` clause so a
        failing assertion or I/O error does not leave it in the working
        directory.
        """
        md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
        filename = 'test.cms'
        cms = CountMinSketch(width=1000, depth=5)
        self.assertEqual(cms.add('this is a test', 100), 100)
        try:
            cms.export(filename)
            md5_out = calc_file_md5(filename)
            self.assertEqual(md5_out, md5_val)

            cms2 = CountMinSketch(filepath=filename, hash_function=different_hash)
            self.assertEqual(cms2.elements_added, 100)
            # should not work since it is a different hash
            # NOTE(review): the next assertion queries `cms`, not `cms2`, and
            # compares the count with True -- confirm the intended check.
            self.assertNotEqual(cms.check('this is a test'), True)
            self.assertNotEqual(cms.hashes('this is a test'),
                                cms2.hashes('this is a test'))
        finally:
            # export may have failed before creating the file
            if os.path.exists(filename):
                os.remove(filename)
    def test_cms_load_diff_hash(self):
        """Reload ``test.cms`` with another hash function and compare hashes.

        Everything that touches the scratch file runs inside ``try`` so that
        the ``finally`` clause deletes the file even when an assertion fails.
        """
        md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
        filename = 'test.cms'
        cms = CountMinSketch(width=1000, depth=5)
        self.assertEqual(cms.add('this is a test', 100), 100)
        try:
            cms.export(filename)
            md5_out = calc_file_md5(filename)
            self.assertEqual(md5_out, md5_val)

            cms2 = CountMinSketch(filepath=filename, hash_function=different_hash)
            self.assertEqual(cms2.elements_added, 100)
            # should not work since it is a different hash
            # NOTE(review): the next assertion queries `cms`, not `cms2`, and
            # compares the count with True -- confirm the intended check.
            self.assertNotEqual(cms.check('this is a test'), True)
            self.assertNotEqual(cms.hashes('this is a test'),
                                cms2.hashes('this is a test'))
        finally:
            # the file only exists if export got far enough to create it
            if os.path.exists(filename):
                os.remove(filename)
    def test_cms_join(self):
        """Join two identically filled count-min sketches; every count doubles."""
        insertions = (
            ("this is a test", 255),
            ("this is another test", 189),
            ("this is also a test", 16),
            ("this is something to test", 5),
        )
        target = CountMinSketch(width=1000, depth=5)
        other = CountMinSketch(width=1000, depth=5)

        # fill the first sketch completely, then the second one
        for sketch in (target, other):
            for key, amount in insertions:
                self.assertEqual(amount, sketch.add(key, amount))

        target.join(other)
        for key, amount in insertions:
            self.assertEqual(amount * 2, target.check(key))
# Stream the 'movie' column of a tab-separated ratings file into a count-min
# sketch, printing the sketch's (shallow) object size every 1000 insertions
# and the total run time at the end.
DATA_FILE = 'data-streaming-project.data'

with open(DATA_FILE) as f:
    rows_number = sum(1 for line in f)

data_df = pd.read_csv(
    DATA_FILE,
    encoding='utf-8',
    delimiter='\t',
    names=['user', 'movie', 'rating', 'timestamp'],
    header=None,
)
df_movies = data_df['movie']

cms = CountMinSketch(width=200, depth=21)

start_time2 = time.time()

# Iterate over the column itself instead of a hard-coded row count so the
# script keeps working when the data file grows or shrinks.
counter = 0
for movie in df_movies:
    counter += 1
    cms.add(str(movie))
    if counter % 1000 == 0:
        # NOTE: sys.getsizeof is shallow; it does not follow references into
        # objects held by the sketch.
        print(sys.getsizeof(cms))

end_time2 = time.time()

cms.check('592')

print(sys.getsizeof(cms))

print("Total execution time: {}".format(end_time2 - start_time2))

            # Fragment of a larger routine (its header is not visible here):
            # feeds stock symbols into a count-min sketch while keeping an
            # exact per-symbol count, and accumulates the CPU time spent on
            # sketch insertions and on sketch queries.
            cms_filename = proj_dir + "cms_files/" + stock_etf + time_interval + "_R" + str(no_of_record) + "_w" + str(width) + "_d" + str(depth) + "_freq.cms"

            stock_symbol_dist = {}
            stock_freq_cms = CountMinSketch(width, depth)
            stock_trade_record_count = 0
            sketch_time = 0
            sketch_qrytime = 0
            total_accuracy = 0

            for stock_trade_line in stock_trade_lines:
                stock_trade_record_count = stock_trade_record_count + 1
                # stop once no_of_record lines have been processed
                if stock_trade_record_count > no_of_record: break
                stock_symbol = stock_trade_line[0].strip()

                # time only the sketch insertion itself
                sketch_starttime = time.process_time()
                add1 = stock_freq_cms.add(stock_symbol)
                sketch_endtime = time.process_time()
                sketch_time = sketch_time + (sketch_endtime - sketch_starttime)

                # exact frequency of the symbol, kept in a plain dict
                if stock_symbol in stock_symbol_dist.keys():
                    stock_symbol_freq = stock_symbol_dist[stock_symbol] + 1
                else:
                    stock_symbol_freq = 1

                stock_symbol_dist.update({stock_symbol: stock_symbol_freq})

            # query the sketch once per distinct symbol, timing only the query
            for stock_symbol in list(stock_symbol_dist.keys()):
                sketch_starttime = time.process_time()
                stock_symbol_freq_cms = stock_freq_cms.check(stock_symbol)
                sketch_endtime = time.process_time()
                sketch_qrytime = sketch_qrytime + (sketch_endtime - sketch_starttime)
Beispiel #39
0
    # Fragment of a larger routine (its header is not visible here): walks
    # over dfStrPr in windows, keeps an exact per-movie counter in movsFrq
    # and, when BQ1_2 is 1, a count-min-sketch estimate in movsFrqSkt.
    while n < len(dfStrPr):
        dfStrPrWin = dfStrPr[start: end]
#        print("start:", start)
#        print("end:", end)
#        print(dfStrPrWin)
        for m , row in dfStrPrWin.iterrows():
            mov = dfStrPrWin.loc[m, "movie"]
            #BQ1_1 movie frq counter
            # NOTE(review): the chained indexing below may assign to a
            # temporary copy instead of movsFrq itself -- verify that the
            # counter is really updated.
            if mov in movsFrq.index:
                movsFrq.loc[mov]['Frq'] = movsFrq.loc[mov]['Frq'] + 1
            else:
                movsFrq.loc[mov] = [1]
            #BQ1_2 min-sketch
            if BQ1_2 == 1:
                mov_s = str(mov)
                cms.add(mov_s)
                movsFrqSkt.loc[mov] = cms.check(mov_s)
                
        # NOTE(review): start and n advance by strmStep while end advances by
        # a fixed 1000 -- confirm that the window is meant to change size
        # whenever strmStep differs from 1000.
        start = start + strmStep
        end = end  + 1000 
        n =n + strmStep
    movsFrq.to_csv(storeResults + "movie-counter.csv")
    
    #BQ1_3 Compare method accuracy
    if BQ1_3 == 1:
        movsFrq = movsFrq.sort_values('Frq', ascending=False)
        movsFrqSkt = movsFrqSkt.sort_values('Frq', ascending=False)
 
        # NOTE(review): the value is printed as RMSE; if mean_squared_error is
        # scikit-learn's function it presumably returns the (non-rooted) MSE
        # by default -- confirm.
        rmse = mean_squared_error(movsFrq, movsFrqSkt)
        print("BQ1_3 Compare method accuracy, RMSE: ",  rmse)
Beispiel #40
0
    return reservoir


#%%
# Top-10 frequencies of the full data set, used as the reference result.
freq_data = top10freq(ip_data)

# try different sizes
sampled_data1 = reservoir_sampling(ip_data, 30000)
freq_data1 = top10freq(sampled_data1)
sampled_data2 = reservoir_sampling(ip_data, 100000)
freq_data2 = top10freq(sampled_data2)
sampled_data3 = reservoir_sampling(ip_data, 500000)
freq_data3 = top10freq(sampled_data3)
sampled_data4 = reservoir_sampling(ip_data, 1000000)
freq_data4 = top10freq(sampled_data4)
sampled_data5 = reservoir_sampling(ip_data, 2000000)
freq_data5 = top10freq(sampled_data5)

#%% CMS
# Import before starting the timer so that module loading is not counted as
# compute time.
import time
from probables import (CountMinSketch)

start_time = time.time()  # count the compute time
cms = CountMinSketch(width=500, depth=100)
# clear() has to be called; the bare attribute access `cms.clear` did nothing
cms.clear()
for i in ip_data:
    cms.add(i)
time_spend = time.time() - start_time
#
# bare expressions: only displayed when the cell is evaluated interactively
cms.error_rate
cms.elements_added
Beispiel #41
0
class makerCache:
    """Fixed-size page cache with marking phases and frequency tracking.

    Every slot holds a ``Node`` (page name, request file, ``marked`` flag);
    an empty slot carries the page name ``'-'``.  A requested page becomes
    marked; once every slot is marked, all marks are cleared and a new phase
    starts.  Request frequencies are tracked per page: pages the model
    classifies as heavy are counted exactly in ``hashtable``, all others are
    counted in a count-min sketch.
    """

    def __init__(self, size):
        """Create ``size`` empty slots, train the model and set up the counters."""
        self.phase = 1
        self.round = 1
        self.size = size
        self.cache = []
        self.clean_counter = 0
        self.clean_set = []
        for i in range(0, size):  # init empty cache(list)
            self.cache.append(Node('-', RequestFile(0, 'txt', False)))
        self.model = train_model()
        self.cms = CountMinSketch(width=1000, depth=5)
        self.hashtable = {}  # single bucket for heavy items
        self.miss_count = 0

    def print(self):
        """Print the page names of all slots on one line, then a spacer line."""
        for i in range(0, self.size):
            print(self.cache[i].page_name, end=' ')
        print(' ')
        print(' ')

    def get_phase(self):
        """Return the current phase number."""
        return self.phase

    def isAllSlotTaken(self):
        """Return True when no slot is empty (page name ``'-'``)."""
        return all(node.page_name != '-' for node in self.cache[:self.size])

    def lookup(self, page_name):
        """Return True when ``page_name`` is currently cached."""
        return any(node.page_name == page_name
                   for node in self.cache[:self.size])

    def set_marked(self, page_name):
        """Mark every slot that holds ``page_name``."""
        for node in self.cache[:self.size]:
            if node.page_name == page_name:
                # was misspelled `makred`, so hits never marked the page
                node.marked = True

    def is_all_marked(self):
        """Return True when every slot is marked."""
        return all(node.marked for node in self.cache[:self.size])

    def reset(self):
        """Clear all marks and start the next phase."""
        for node in self.cache[:self.size]:
            node.marked = False
        self.phase = self.phase + 1
        self.clean_counter = 0
        # save current cache as the set of elements that are possibly stale in new phase

    def evict(self, slot_pos):
        """Turn the slot at ``slot_pos`` back into an empty, unmarked slot."""
        node = self.cache[slot_pos]
        node.page_name = '-'
        node.request_file.update_year = 0
        node.request_file.file_type = '-'
        # was misspelled `is_in_hompage`, which left the real flag untouched
        node.request_file.is_in_homepage = False
        node.marked = False

    def fill_in(self, slot_pos, new_page_name, request_file):
        """Store a page in the slot at ``slot_pos``, mark it and count the request.

        The attributes of ``request_file`` are copied into the slot's own
        request file; the frequency counter chosen by the heavy/light
        classification of that file is then incremented.
        """
        node = self.cache[slot_pos]
        node.page_name = new_page_name
        node.request_file.update_year = request_file.update_year
        node.request_file.file_type = request_file.file_type
        # was misspelled `is_in_hompage`, which left the real flag untouched
        node.request_file.is_in_homepage = request_file.is_in_homepage
        node.marked = True
        file_df = file_to_dataframe(node.request_file)
        is_heavy0 = is_heavy(self.model, file_df)
        if is_heavy0:
            self.hashtable[new_page_name] = self.hashtable.get(new_page_name, 0) + 1
        else:
            self.cms.add(new_page_name)

    def replace(self, slot_pos, new_page_name, request_file):
        """Evict the slot at ``slot_pos`` and fill it with the new page."""
        self.evict(slot_pos)
        self.fill_in(slot_pos, new_page_name, request_file)

    def select_unmarked(self, mode):
        """Return the cache slot index of the unmarked page to evict.

        ``mode`` is ``'random'`` (uniform choice among the unmarked slots) or
        ``'ml_oracle'`` (the unmarked slot whose page has the lowest tracked
        frequency; the first such slot wins a tie).

        Raises:
            ValueError: when no slot is unmarked or ``mode`` is unknown.
        """
        unmarked = []
        unmarked_freq = []

        for i in range(0, self.size):
            if not self.cache[i].marked:
                print(self.cache[i])
                unmarked.append(i)
                unmarked_freq.append(0)

        print('unmarked:', unmarked)
        if not unmarked:
            raise ValueError('no unmarked slot available for eviction')

        # rand method
        if mode == 'random':
            rand_pos = random.randint(0, len(unmarked) - 1)
            print('unmarked len =', len(unmarked))
            print('rand_pos=', rand_pos)
            print('real pos =', unmarked[rand_pos])
            return unmarked[rand_pos]

        if mode == 'ml_oracle':
            # ml method
            # predict all unmarked element (predict time = frequency from count-min sketch)
            # select the lowest frequency unmarked element
            for i, slot_pos in enumerate(unmarked):
                node = self.cache[slot_pos]
                # data preprocess
                file_df = file_to_dataframe(node.request_file)
                if is_heavy(self.model, file_df):
                    # heavy : exact count from the hash table
                    unmarked_freq[i] = self.hashtable.get(node.page_name, 0)
                else:
                    # not heavy : get from count min sketch
                    unmarked_freq[i] = self.cms.check(node.page_name)
            print(unmarked)
            print(unmarked_freq)
            # The previous code compared the slot indexes themselves and
            # returned a position inside `unmarked`; pick the slot with the
            # lowest frequency and return its real cache index instead.
            victim = min(range(len(unmarked)), key=unmarked_freq.__getitem__)
            return unmarked[victim]

        raise ValueError('unknown selection mode: {!r}'.format(mode))

    def _reset_if_all_marked(self):
        """Start a new phase (after printing the cache) once every slot is marked."""
        if self.is_all_marked():
            print('cache is all marked, ready to reset')
            self.print()
            self.reset()

    def request_page(self, page_name, request_file, mode='random'):
        """Serve a request; return True on a cache hit and False on a miss.

        On a hit the page is marked and its frequency counter incremented.
        On a miss the page is placed into the first empty slot or, when the
        cache is full, into the slot chosen by ``select_unmarked(mode)``.
        """
        # requested page in cache
        if self.lookup(page_name):
            self.set_marked(page_name)  # set its marked = true
            file_df = file_to_dataframe(request_file)
            is_heavy0 = is_heavy(self.model, file_df)
            if is_heavy0:
                if page_name in self.hashtable:
                    self.hashtable[page_name] = self.hashtable[page_name] + 1
                    print('freq:', self.hashtable[page_name])
                else:
                    self.hashtable[page_name] = 1
            else:
                self.cms.add(page_name)
                print('freq:', self.cms.check(page_name))

            self._reset_if_all_marked()
            return True

        # a miss occur
        self.miss_count += 1

        # page not in cache, idle slot available
        if not self.isAllSlotTaken():
            for i in range(0, self.size):
                if self.cache[i].page_name == '-':
                    self.replace(i, page_name, request_file)
                    break
            self._reset_if_all_marked()
            return False

        # page not in cache and no idle slot
        replace_pos = self.select_unmarked(mode)
        self.replace(replace_pos, page_name, request_file)
        self._reset_if_all_marked()
        return False
Beispiel #42
0
def get_withtime_files(path, filter_threshold):
    """Parse per-user request logs, bucket them by time of day and plot loads.

    Every file in every directory of ``path`` is read line by line; lines are
    split on single spaces, with field 1 used as an epoch timestamp, field 3
    as the requested link and field 4 as a size.  Consecutive requests for
    the same link category are collapsed.  For every time bucket the
    prefetching load is computed, large changes between neighbouring buckets
    are saved as trigger points, and the loads are plotted.

    Args:
        path: iterable of directory paths whose files are parsed.
        filter_threshold: a link only enters the prefetching list when its
            size is greater than this value.

    Returns:
        A tuple ``(raw_data_set, number_of_buckets, prefetching_list,
        user_scale)`` where ``raw_data_set`` maps a time bucket to
        ``[user_id, link_category, size]`` records, ``prefetching_list`` maps
        a link category to the largest size seen (kept as the original
        string) and ``user_scale`` is the largest user id plus one.
    """
    raw_data_set = collections.defaultdict(list)
    count = 0  # files that could not be parsed completely
    prefetching_list = {}
    ID_stack = []
    for directory in path:
        for filename in os.listdir(directory):
            data_path = os.path.join(directory, filename)
            user_id = str(filename.split('.')[0][3:])
            try:
                # the context manager closes the handle (it used to leak)
                with open(data_path) as fp:
                    features = fp.readlines()
                last_same = link_pure_maker(features[0].split(' ')[3].split('/'))
                for num, line in enumerate(features):
                    vec = line.split(' ')
                    # delete the same pattern
                    if vec[3][-4:-1] in IGNORE_PATTERN:
                        continue
                    link_cat = link_pure_maker(vec[3].split('/'))
                    if num != 0 and link_cat == last_same:
                        continue
                    last_same = link_cat
                    tarray = time.localtime(int(vec[1]))
                    # NOTE(review): tm_hour is weighted with 360 rather than
                    # 3600, so different times of day can share a bucket;
                    # left unchanged because results and thresholds depend on
                    # it -- confirm the intended formula.
                    time_str_buff = int((tarray.tm_hour*360+tarray.tm_min*60+tarray.tm_sec)/10)
                    # a missing or non-numeric size only skips the size-based
                    # bookkeeping for this line
                    try:
                        size = int(vec[4])
                    except (ValueError, IndexError):
                        size = None
                    if size is not None and size > 0:
                        raw_data_set[time_str_buff].append([user_id, link_cat, vec[4]])
                    ID_stack.append(int(user_id))
                    # [link, user_ID, time_hour]
                    if size is not None and size > filter_threshold:
                        # compare numerically; the stored strings used to be
                        # compared lexicographically ("9" > "10")
                        if (link_cat not in prefetching_list
                                or size > int(prefetching_list[link_cat])):
                            prefetching_list[link_cat] = vec[4]
            except Exception:
                # best effort: an unreadable or malformed file is counted and
                # the rest of it is skipped
                count += 1

    # monitoring
    time_list = []
    for key, records in raw_data_set.items():
        capacity = 0
        cms = CountMinSketch(width=1000, depth=5)
        bloom = BloomFilter(max_elements=10000, error_rate=0.1)
        for record in records:
            bloom.add(record[1])
            cms.add(record[1])
        type_count = 0
        amount = 0
        for link_cat, size_text in prefetching_list.items():
            if link_cat in bloom:
                capacity += int(size_text)*100
                amount += 1
                type_count += cms.check(link_cat)
        if type_count == 0:
            rep_eta = 0
        else:
            rep_eta = amount/type_count
        time_list.append([key, capacity, rep_eta])

    time_list.sort()
    # [bucket, load of previous bucket - load of this bucket, load, rep_eta]
    a_list = []
    for i in range(1, len(time_list)):
        a_list.append([time_list[i][0], time_list[i-1][1]-time_list[i][1], time_list[i][1], time_list[i][2]])
    trigger_list = []
    for entry in a_list:
        if abs(entry[1]) > 250000000:
            # NOTE(review): raises ZeroDivisionError when this bucket's load
            # (entry[2]) is 0 -- confirm whether that can occur.
            trigger_list.append([entry[0], entry[1]/entry[2], entry[2], entry[3]])
    obj_save(trigger_list, 'trigger_list_dir.txt')
    obj_save(time_list, 'time_list_dir.txt')
    # label: number of trigger points
    print('激活点位', len(trigger_list))
    # label: total number of time points
    print('总时间点位', len(time_list))

    plt.title('   ')
    plt.xlabel('timestamp')
    plt.xticks(rotation=45)
    plt.ylabel('Loads')
    plt.plot([i[0] for i in time_list], [i[1] for i in time_list], '-', color='b', label='Prefetching Loads')
    plt.plot([i[0] for i in a_list], [i[1] for i in a_list], '-', color='r', label='fluctuation')
    plt.legend()
    plt.grid()
    plt.show()
    print(len(raw_data_set))
    user_id_array = np.array(ID_stack)
    user_scale = np.max(user_id_array)+1
    print('irregular_format:', count)
    print('Num of users: ', user_scale)
    return raw_data_set, len(raw_data_set), prefetching_list, user_scale