def test_cms_remove_mult(self):
    """Removing several occurrences in one call lowers the count and elements_added."""
    sketch = CountMinSketch(width=1000, depth=5)
    key = "this is a test"
    # Seed the sketch with 16 occurrences of the key.
    self.assertEqual(sketch.add(key, 16), 16)
    self.assertEqual(sketch.elements_added, 16)
    # Take 4 away in a single call; 12 are expected to remain.
    self.assertEqual(sketch.remove(key, 4), 12)
    self.assertEqual(sketch.elements_added, 12)
def test_cms_bytes(self):
    """The bytes() form of a populated sketch hashes to the recorded MD5 digest."""
    expected_digest = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    sketch = CountMinSketch(width=1000, depth=5)
    sketch.add("this is a test", 100)
    raw = bytes(sketch)
    actual_digest = hashlib.md5(raw).hexdigest()
    self.assertEqual(actual_digest, expected_digest)
def test_cms_remove_mult(self):
    ''' removing several occurrences in one call lowers the count and elements_added '''
    sketch = CountMinSketch(width=1000, depth=5)
    key = 'this is a test'
    # Seed the sketch with 16 occurrences of the key.
    self.assertEqual(sketch.add(key, 16), 16)
    self.assertEqual(sketch.elements_added, 16)
    # Take 4 away in a single call; 12 are expected to remain.
    self.assertEqual(sketch.remove(key, 4), 12)
    self.assertEqual(sketch.elements_added, 12)
def test_cms_remove_single(self):
    ''' remove() with no count argument is expected to take away one occurrence per call '''
    sketch = CountMinSketch(width=1000, depth=5)
    key = 'this is a test'
    self.assertEqual(sketch.add(key, 4), 4)
    self.assertEqual(sketch.elements_added, 4)
    # Two default removals: the returned count steps 3, then 2.
    for remaining in (3, 2):
        self.assertEqual(sketch.remove(key), remaining)
    self.assertEqual(sketch.elements_added, 2)
def test_cms_min_val(self):
    ''' removing INT64_T_MAX + 5 occurrences: check() must equal INT32_T_MIN
        and elements_added must equal INT64_T_MIN '''
    oversized = INT64_T_MAX + 5
    sketch = CountMinSketch(width=1000, depth=5)
    key = 'this is a test'
    sketch.remove(key, oversized)
    self.assertEqual(sketch.check(key), INT32_T_MIN)
    self.assertEqual(sketch.elements_added, INT64_T_MIN)
def test_cms_add_mult(self):
    """Adding four occurrences per call reports the running total each time."""
    sketch = CountMinSketch(width=1000, depth=5)
    key = "this is a test"
    # Four calls of add(key, 4): expected totals 4, 8, 12, 16.
    for running_total in (4, 8, 12, 16):
        self.assertEqual(sketch.add(key, 4), running_total)
    self.assertEqual(sketch.elements_added, 16)
def test_cms_remove_single(self):
    """remove() with no count argument is expected to take away one occurrence per call."""
    sketch = CountMinSketch(width=1000, depth=5)
    key = "this is a test"
    self.assertEqual(sketch.add(key, 4), 4)
    self.assertEqual(sketch.elements_added, 4)
    # Two default removals: the returned count steps 3, then 2.
    for remaining in (3, 2):
        self.assertEqual(sketch.remove(key), remaining)
    self.assertEqual(sketch.elements_added, 2)
def test_cms_add_mult(self):
    ''' adding four occurrences per call reports the running total each time '''
    sketch = CountMinSketch(width=1000, depth=5)
    key = 'this is a test'
    # Four calls of add(key, 4): expected totals 4, 8, 12, 16.
    for running_total in (4, 8, 12, 16):
        self.assertEqual(sketch.add(key, 4), running_total)
    self.assertEqual(sketch.elements_added, 16)
def test_cms_add_single(self):
    ''' add() with no count argument is expected to add one occurrence per call '''
    sketch = CountMinSketch(width=1000, depth=5)
    key = 'this is a test'
    # Four default adds: the returned count steps 1, 2, 3, 4.
    for running_total in (1, 2, 3, 4):
        self.assertEqual(sketch.add(key), running_total)
    self.assertEqual(sketch.elements_added, 4)
def test_cms_max_val(self):
    ''' adding INT64_T_MAX + 5 occurrences: check() must equal INT32_T_MAX
        and elements_added must equal INT64_T_MAX '''
    oversized = INT64_T_MAX + 5
    sketch = CountMinSketch(width=1000, depth=5)
    key = 'this is a test'
    sketch.add(key, oversized)
    self.assertEqual(sketch.check(key), INT32_T_MAX)
    self.assertEqual(sketch.elements_added, INT64_T_MAX)
def test_cms_add_single(self):
    """add() with no count argument is expected to add one occurrence per call."""
    sketch = CountMinSketch(width=1000, depth=5)
    key = "this is a test"
    # Four default adds: the returned count steps 1, 2, 3, 4.
    for running_total in (1, 2, 3, 4):
        self.assertEqual(sketch.add(key), running_total)
    self.assertEqual(sketch.elements_added, 4)
def test_cms_max_val(self):
    """Adding INT64_T_MAX + 5 occurrences caps the reported values.

    check() must equal INT32_T_MAX and elements_added must equal INT64_T_MAX.
    """
    oversized = INT64_T_MAX + 5
    sketch = CountMinSketch(width=1000, depth=5)
    key = "this is a test"
    sketch.add(key, oversized)
    self.assertEqual(sketch.check(key), INT32_T_MAX)
    self.assertEqual(sketch.elements_added, INT64_T_MAX)
def test_cms_clear(self):
    ''' clear() must reset elements_added and the stored count for a key to 0 '''
    sketch = CountMinSketch(width=1000, depth=5)
    key = 'this is a test'
    # Populate first so there is something to clear.
    self.assertEqual(sketch.add(key, 100), 100)
    self.assertEqual(sketch.elements_added, 100)
    sketch.clear()
    self.assertEqual(sketch.elements_added, 0)
    self.assertEqual(sketch.check(key), 0)
def test_cms_different_hash(self):
    ''' a sketch built with different_hash must hash a key differently from the default '''
    key = 'this is a test'
    default_sketch = CountMinSketch(width=1000, depth=5)
    default_hashes = default_sketch.hashes(key)
    custom_sketch = CountMinSketch(width=1000, depth=5,
                                   hash_function=different_hash)
    custom_hashes = custom_sketch.hashes(key)
    self.assertNotEqual(default_hashes, custom_hashes)
def test_cms_set_query_type(self):
    """query_type starts as "min", accepts "mean-min" and "mean", and reads back "min" for an unknown value."""
    sketch = CountMinSketch(width=1000, depth=5)
    self.assertEqual(sketch.query_type, "min")
    # (value assigned, value expected when read back)
    transitions = (
        ("mean-min", "mean-min"),
        ("mean", "mean"),
        ("unknown", "min"),
    )
    for assigned, expected in transitions:
        sketch.query_type = assigned
        self.assertEqual(sketch.query_type, expected)
def test_cms_set_query_type(self):
    ''' query_type starts as 'min', accepts 'mean-min' and 'mean', and reads
        back 'min' for an unknown value '''
    sketch = CountMinSketch(width=1000, depth=5)
    self.assertEqual(sketch.query_type, 'min')
    # (value assigned, value expected when read back)
    transitions = (
        ('mean-min', 'mean-min'),
        ('mean', 'mean'),
        ('unknown', 'min'),
    )
    for assigned, expected in transitions:
        sketch.query_type = assigned
        self.assertEqual(sketch.query_type, expected)
def test_cms_join_invalid(self):
    """Joining a count-min sketch with a non-sketch object raises TypeError.

    An ``int`` is passed to ``join`` and the text of the raised exception is
    compared with the expected message.
    """
    cms = CountMinSketch(width=1000, depth=5)
    msg = "Unable to merge a count-min sketch with {}".format("<class 'int'>")
    # assertRaises fails with a descriptive message when nothing is raised,
    # instead of the opaque ``assertEqual(True, False)`` fallback.
    with self.assertRaises(TypeError) as ctx:
        cms.join(1)
    self.assertEqual(str(ctx.exception), msg)
def test_cms_str(self):
    ''' str() of a populated sketch must match the expected multi-line summary '''
    sketch = CountMinSketch(width=1000, depth=5)
    self.assertEqual(sketch.add('this is a test', 100), 100)
    # One line per reported field; tabs indent everything after the title.
    expected = (
        'Count-Min Sketch:\n'
        '\tWidth: 1000\n'
        '\tDepth: 5\n'
        '\tConfidence: 0.96875\n'
        '\tError Rate: 0.002\n'
        '\tElements Added: 100'
    )
    rendered = str(sketch)
    self.assertEqual(rendered, expected)
def test_cms_export(self):
    ''' exporting a populated sketch must produce a file with the recorded MD5

        The sketch is written to ``test.cms`` in the working directory and the
        file is removed afterwards, even if exporting or hashing raises.
    '''
    md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
    filename = 'test.cms'
    cms = CountMinSketch(width=1000, depth=5)
    cms.add('this is a test', 100)
    try:
        cms.export(filename)
        md5_out = calc_file_md5(filename)
    finally:
        # Clean up on every path so a failure does not leave test.cms behind;
        # the existence check covers an export that failed before creating it.
        if os.path.exists(filename):
            os.remove(filename)
    self.assertEqual(md5_out, md5_val)
def test_cms_frombytes(self):
    """A sketch rebuilt with frombytes() must match the original's bytes, dimensions and count."""
    key = "this is a test"
    source = CountMinSketch(width=1000, depth=5)
    source.add(key, 100)
    serialized = bytes(source)
    restored = CountMinSketch.frombytes(serialized)
    # Round trip: re-serializing the restored sketch must give the same bytes.
    self.assertEqual(bytes(restored), bytes(source))
    self.assertEqual(restored.width, 1000)
    self.assertEqual(restored.depth, 5)
    self.assertEqual(restored.check(key), 100)
def test_cms_join_mismatch_depth(self):
    """Joining sketches of different depth raises CountMinSketchError.

    Both sketches have width 1000; the depths are 5 and 4. The exception's
    ``message`` attribute is compared with the expected text.
    """
    cms1 = CountMinSketch(width=1000, depth=5)
    cms2 = CountMinSketch(width=1000, depth=4)
    msg = "Unable to merge as the count-min sketches are mismatched"
    # assertRaises fails with a descriptive message when nothing is raised,
    # instead of the opaque ``assertEqual(True, False)`` fallback.
    with self.assertRaises(CountMinSketchError) as ctx:
        cms1.join(cms2)
    self.assertEqual(ctx.exception.message, msg)
def test_cms_check_min(self):
    """With the default query type, check() must return the exact count added for each of four keys."""
    sketch = CountMinSketch(width=1000, depth=5)
    inserted = (
        ("this is a test", 255),
        ("this is another test", 189),
        ("this is also a test", 16),
        ("this is something to test", 5),
    )
    for key, count in inserted:
        self.assertEqual(sketch.add(key, count), count)
    # Query in the opposite order to insertion.
    for key, count in reversed(inserted):
        self.assertEqual(sketch.check(key), count)
    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def __init__(self, size):
    """Set up the initial state for a cache with ``size`` slots.

    Fills ``self.cache`` with ``size`` placeholder nodes, stores the result of
    ``train_model()``, creates a count-min sketch (width 1000, depth 5) and
    resets the counters.
    """
    self.phase = 1
    self.round = 1
    self.size = size
    self.clean_counter = 0
    self.clean_set = []
    # Every slot starts as a placeholder Node wrapping a dummy RequestFile.
    self.cache = [Node('-', RequestFile(0, 'txt', False)) for _ in range(size)]
    self.model = train_model()
    self.cms = CountMinSketch(width=1000, depth=5)
    self.hashtable = {}  # single bucket for heavy items
    self.miss_count = 0
def test_cms_check_mean_called(self):
    ''' with query_type set to 'mean', check() must return the exact count
        added for each of four keys '''
    sketch = CountMinSketch(width=1000, depth=5)
    sketch.query_type = 'mean'
    inserted = (
        ('this is a test', 255),
        ('this is another test', 189),
        ('this is also a test', 16),
        ('this is something to test', 5),
    )
    for key, count in inserted:
        self.assertEqual(sketch.add(key, count), count)
    # Query in the opposite order to insertion.
    for key, count in reversed(inserted):
        self.assertEqual(sketch.check(key), count)
    self.assertEqual(sketch.elements_added, 5 + 16 + 189 + 255)
def test_cms_load_diff_hash(self):
    """A sketch exported to file and reloaded with different_hash keeps elements_added but hashes keys differently."""
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    key = "this is a test"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp:
        path = tmp.name
        original = CountMinSketch(width=1000, depth=5)
        self.assertEqual(original.add(key, 100), 100)
        original.export(path)
        self.assertEqual(calc_file_md5(path), expected_md5)

        reloaded = CountMinSketch(filepath=path, hash_function=different_hash)
        self.assertEqual(reloaded.elements_added, 100)
        # should not work since it is a different hash
        # NOTE(review): this queries the original sketch, not the reloaded one,
        # and compares a count with True - confirm the intended assertion.
        self.assertNotEqual(original.check(key), True)
        self.assertNotEqual(original.hashes(key), reloaded.hashes(key))
def test_cms_load_diff_hash(self):
    ''' a sketch exported to file and reloaded with different_hash keeps
        elements_added but hashes keys differently

        The sketch is written to ``test.cms`` in the working directory; the
        file is removed afterwards even when an assertion fails.
    '''
    md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
    filename = 'test.cms'
    cms = CountMinSketch(width=1000, depth=5)
    self.assertEqual(cms.add('this is a test', 100), 100)
    try:
        cms.export(filename)
        md5_out = calc_file_md5(filename)
        self.assertEqual(md5_out, md5_val)

        cms2 = CountMinSketch(filepath=filename,
                              hash_function=different_hash)
        self.assertEqual(cms2.elements_added, 100)
        # should not work since it is a different hash
        # NOTE(review): this queries cms, not cms2, and compares a count with
        # True - confirm the intended assertion.
        self.assertNotEqual(cms.check('this is a test'), True)
        self.assertNotEqual(cms.hashes('this is a test'),
                            cms2.hashes('this is a test'))
    finally:
        # A failed assertion used to skip the trailing os.remove and leave
        # test.cms behind for later tests; clean up on every path instead.
        if os.path.exists(filename):
            os.remove(filename)
def test_cms_mismatch_hash_function(self):
    """Joining sketches built with different hash functions raises CountMinSketchError.

    Both sketches share width and depth; only ``hash_function`` differs. The
    exception's ``message`` attribute is compared with the expected text.
    """
    cms1 = CountMinSketch(width=1000, depth=5)
    cms2 = CountMinSketch(width=1000, depth=5, hash_function=different_hash)
    msg = "Unable to merge as the count-min sketches are mismatched"
    # A single context-managed assertRaises covers both the "raises" check and
    # the message check, replacing the separate runner closure and the
    # try/except/else block that each repeated the same join call.
    with self.assertRaises(CountMinSketchError) as ctx:
        cms1.join(cms2)
    self.assertEqual(ctx.exception.message, msg)
def test_cms_init_wd(self):
    """A sketch built from width=1000 and depth=5 must report the expected derived attributes."""
    sketch = CountMinSketch(width=1000, depth=5)
    expected_attrs = (
        ("width", 1000),
        ("depth", 5),
        ("confidence", 0.96875),
        ("error_rate", 0.002),
        ("elements_added", 0),
    )
    for attr_name, expected in expected_attrs:
        self.assertEqual(getattr(sketch, attr_name), expected)
def test_cms_init_ce(self):
    """A sketch built from confidence=0.96875 and error_rate=0.002 must report width 1000 and depth 5."""
    sketch = CountMinSketch(confidence=0.96875, error_rate=0.002)
    expected_attrs = (
        ("width", 1000),
        ("depth", 5),
        ("confidence", 0.96875),
        ("error_rate", 0.002),
        ("elements_added", 0),
    )
    for attr_name, expected in expected_attrs:
        self.assertEqual(getattr(sketch, attr_name), expected)
def test_cms_load(self):
    """A sketch exported to a temporary file and reloaded via filepath= keeps its totals."""
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    key = "this is a test"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp:
        path = tmp.name
        original = CountMinSketch(width=1000, depth=5)
        self.assertEqual(original.add(key, 100), 100)
        original.export(path)
        self.assertEqual(calc_file_md5(path), expected_md5)

        # Build a second sketch straight from the exported file.
        reloaded = CountMinSketch(filepath=path)
        self.assertEqual(reloaded.elements_added, 100)
        self.assertEqual(reloaded.check(key), 100)
def test_cms_join_mixed_types(self):
    """Count-min, count-mean and count-mean-min sketches can be joined with each other.

    Each sketch starts with one distinct key. After every ``join`` the test
    checks, through the ``in`` operator, which keys the receiving sketch
    reports as present and which it does not.
    """
    cms = CountMinSketch(width=1000, depth=5)
    cmeans = CountMeanSketch(width=1000, depth=5)
    cmms = CountMeanMinSketch(width=1000, depth=5)
    cms.add("this is a test", 500)
    cmeans.add("this is another test", 500)
    cmms.add("this is yet another test", 500)

    # assertIn / assertNotIn report both operands on failure, unlike
    # assertTrue(x in y), which only says "False is not true".
    cms.join(cmeans)
    self.assertIn("this is a test", cms)
    self.assertIn("this is another test", cms)
    self.assertNotIn("this is yet another test", cms)

    cmeans.join(cmms)
    self.assertNotIn("this is a test", cmeans)
    self.assertIn("this is another test", cmeans)
    self.assertIn("this is yet another test", cmeans)
    self.assertNotIn("foobar", cmeans)

    cmms.join(cms)
    self.assertIn("this is a test", cmms)
    self.assertIn("this is another test", cmms)
    self.assertIn("this is yet another test", cmms)
    self.assertNotIn("this is yet another test!", cmms)
def test_cms_init_error_msg(self):
    ''' constructing a sketch with only a width raises InitializationError

        The text of the raised exception is compared with the expected
        multi-line message.
    '''
    msg = ('Must provide one of the following to initialize the '
           'Count-Min Sketch:\n'
           '    A file to load,\n'
           '    The width and depth,\n'
           '    OR confidence and error rate')
    # assertRaises fails with a descriptive message when nothing is raised,
    # instead of the opaque ``assertEqual(True, False)`` fallback.
    with self.assertRaises(InitializationError) as ctx:
        CountMinSketch(width=1000)
    self.assertEqual(str(ctx.exception), msg)
def test_cms_different_hash(self):
    """A sketch built with different_hash must hash a key differently from the default."""
    key = "this is a test"
    default_sketch = CountMinSketch(width=1000, depth=5)
    default_hashes = default_sketch.hashes(key)
    custom_sketch = CountMinSketch(width=1000, depth=5, hash_function=different_hash)
    custom_hashes = custom_sketch.hashes(key)
    self.assertNotEqual(default_hashes, custom_hashes)
def test_cms_load(self):
    ''' a sketch exported to file and reloaded via filepath= keeps its totals

        The sketch is written to ``test.cms`` in the working directory; the
        file is removed afterwards even when an assertion fails.
    '''
    md5_val = '61d2ea9d0cb09b7bb284e1cf1a860449'
    filename = 'test.cms'
    cms = CountMinSketch(width=1000, depth=5)
    self.assertEqual(cms.add('this is a test', 100), 100)
    try:
        cms.export(filename)
        md5_out = calc_file_md5(filename)
        self.assertEqual(md5_out, md5_val)

        # try loading directly to file!
        cms2 = CountMinSketch(filepath=filename)
        self.assertEqual(cms2.elements_added, 100)
        self.assertEqual(cms2.check('this is a test'), 100)
    finally:
        # A failed assertion used to skip the trailing os.remove and leave
        # test.cms behind for later tests; clean up on every path instead.
        if os.path.exists(filename):
            os.remove(filename)
def test_cms_export(self):
    """Exporting a populated sketch to a temporary file must produce the recorded MD5."""
    expected_md5 = "fb1c39dd1a73f1ef0d7fc79f60fc028e"
    with NamedTemporaryFile(dir=os.getcwd(), suffix=".cms", delete=DELETE_TEMP_FILES) as tmp:
        path = tmp.name
        sketch = CountMinSketch(width=1000, depth=5)
        sketch.add("this is a test", 100)
        sketch.export(path)
        actual_md5 = calc_file_md5(path)
        self.assertEqual(actual_md5, expected_md5)
def test_cms_clear(self):
    """clear() must reset elements_added and the stored count for a key to 0."""
    sketch = CountMinSketch(width=1000, depth=5)
    key = "this is a test"
    # Populate first so there is something to clear.
    self.assertEqual(sketch.add(key, 100), 100)
    self.assertEqual(sketch.elements_added, 100)
    sketch.clear()
    self.assertEqual(sketch.elements_added, 0)
    self.assertEqual(sketch.check(key), 0)