def test_update_no_collisions(self):
        """
        Compare sketch estimates with exact counts when collisions should not matter.

        The sketch is ``2 ** 17`` wide and receives a quarter as many distinct
        keys; the parameters are chosen so that no hash collision affects results.
        """
        sketch_width = 2 ** 17
        distinct_keys = sketch_width // 4

        cms = CountMinSketch(width=sketch_width, depth=8, log_counting=self.log_counting)
        cms.update(generateData(distinct_keys))
        # Exact reference counts, built from a second call to generateData.
        expected = Counter(generateData(distinct_keys))

        # stats() returns six figures; only three of them are asserted on below.
        (bias, _deviation, max_log_error,
         avg_log_error, _max_d_error, _max_error_expected) = stats(cms, expected)

        self.assertAlmostEqual(
            max_log_error, 0,
            delta=self.max_log_tolerance,
            msg="Each result should be within maximum tolerance")
        self.assertAlmostEqual(
            avg_log_error, 0,
            delta=self.avg_log_tolerance,
            msg="Average log deviation should be low")
        self.assertAlmostEqual(
            bias, 0,
            delta=self.total_bias_tolerance,
            msg="Total bias should be low")
Example #2
0
 def test_depth_alg_init(self):
     """
     Build sketches from a size in MB together with an explicit depth.

     Each case is ``(log_counting, size_mb, expected width, depth)``. The
     sketch must report that width and depth, and ``size()`` must not
     exceed ``size_mb * 1024 * 1024``.
     """
     cases = (
         (None, 4, 2**18, 4),
         (None, 17, 2**19, 8),
         (1024, 70, 2**21, 17),
         (8, 100, 2**26, 1),
     )
     for algorithm, megabytes, expected_width, rows in cases:
         sketch = CountMinSketch(size_mb=megabytes,
                                 depth=rows,
                                 log_counting=algorithm)
         self.assertEqual(sketch.width, expected_width)
         self.assertEqual(sketch.depth, rows)
         self.assertLessEqual(sketch.size(), megabytes * 1024 * 1024)
Example #3
0
 def test_width_depth_alg_init(self):
     """
     Build sketches from an explicit width and depth.

     Each case is ``(log_counting, width, depth, size limit)``. The sketch
     must report the requested width and depth, and ``size()`` must not
     exceed the listed limit.
     """
     cases = (
         (None, 2**12, 3, 49152),
         (None, 2**13, 7, 229376),
         (1024, 2**18, 1, 524288),
         (8, 2**13, 7, 57344),
     )
     for algorithm, columns, rows, size_limit in cases:
         sketch = CountMinSketch(width=columns,
                                 depth=rows,
                                 log_counting=algorithm)
         self.assertEqual(sketch.width, columns)
         self.assertEqual(sketch.depth, rows)
         self.assertLessEqual(sketch.size(), size_limit)
class CountMinSketchQualityCommonTest(unittest.TestCase):
    """
    Functional tests for CountMinSketch.quality method, which returns quality rating of the structure
    """

    def __init__(self, methodName='runTest', log_counting=None):
        """
        Create a test case bound to one ``log_counting`` setting.

        :param methodName: name of the test method to run, forwarded to unittest.TestCase
        :param log_counting: value passed as ``log_counting`` to the CountMinSketch built in setUp
        """
        self.log_counting = log_counting
        super(CountMinSketchQualityCommonTest, self).__init__(methodName=methodName)

    def setUp(self):
        """Create a fresh sketch (size argument 1) for every test."""
        self.cms = CountMinSketch(1, log_counting=self.log_counting)

    def test_quality_default(self):
        """
        Uses the default structure

        An empty sketch must report quality 0. After incrementing a number of
        distinct keys equal to three quarters of the width, quality must lie
        in [0.5, 1.0]; after a further pass over seven times as many keys it
        must lie in [4.0, 6.0].
        """
        self.assertEqual(self.cms.quality(), 0)

        three_quarters = (self.cms.width * 3) // 4
        for i in range(three_quarters):
            self.cms.increment(str(i), 1 + (i % 13))

        quality = self.cms.quality()
        self.assertGreaterEqual(quality, 0.5)
        self.assertLessEqual(quality, 1.0)

        # The second pass starts again at key "0", so it revisits the keys
        # from the first pass before adding new ones.
        for i in range(three_quarters * 7):
            self.cms.increment(str(i), 1 + (i % 13))

        quality = self.cms.quality()
        self.assertGreaterEqual(quality, 4.0)
        self.assertLessEqual(quality, 6.0)
Example #5
0
    def test_invalid_width(self):
        """
        Check which ``width`` arguments the constructor accepts.

        ``width=2**20`` with ``size_mb=8`` must construct without error; each
        of the remaining argument sets must raise ValueError.
        """
        # Accepted combination: must not raise.
        CountMinSketch(size_mb=8, width=2**20)

        rejected = (
            dict(size_mb=8, width=2**20 - 1),  # width must be a power of 2!
            dict(size_mb=8, width=2**22),  # width too large!
            dict(width=2**22 - 1, depth=8),  # width must be a power of 2!
        )
        for kwargs in rejected:
            with self.assertRaises(ValueError):
                CountMinSketch(**kwargs)
Example #6
0
    def size_check(self, log_counting=None, width_adjustment=1):
        """
        Verify the dimensions chosen for sketches created from a size in MB only.

        For every table entry the sketch width must equal the listed width
        times ``width_adjustment``, the depth must match, and ``size()`` must
        be greater than half of, and at most, ``size_mb * 1024 * 1024``.

        :param log_counting: forwarded to CountMinSketch as ``log_counting``
        :param width_adjustment: factor applied to each expected width in the table
        """
        # (size_mb, expected width before adjustment, expected depth)
        expectations = (
            (1, 2**15, 8), (2, 2**16, 8), (3, 2**16, 12),
            (4, 2**17, 8), (5, 2**17, 10), (6, 2**17, 12),
            (7, 2**17, 14), (8, 2**18, 8), (32, 2**20, 8),
            (55, 2**20, 13), (95, 2**21, 11), (256, 2**23, 8),
        )

        for megabytes, base_width, rows in expectations:
            sketch = CountMinSketch(megabytes, log_counting=log_counting)
            byte_budget = megabytes * 1024 * 1024

            self.assertEqual(sketch.width, base_width * width_adjustment,
                             "Width for size %d" % megabytes)
            self.assertEqual(sketch.depth, rows, "Depth for size %d" % megabytes)
            self.assertLessEqual(sketch.size(), byte_budget)
            self.assertGreater(sketch.size(), byte_budget / 2)
Example #7
0
    def test_update_with_cms(self):
        """
        Merge a second sketch into ``self.cms`` and compare with the summed counts.

        The log variants are only precise up to 2048 (16), so we don't use larger values here
        """
        first_batch = {'a': 1, 'b': 3, 'c': 2, 'd': 5}
        second_batch = {'a': 15, 'b': 4, 'c': 6, 'e': 13}
        expected = {'a': 16, 'b': 7, 'c': 8, 'd': 5, 'e': 13}

        self.cms.update(first_batch)
        other = CountMinSketch(1, log_counting=self.log_counting)
        other.update(second_batch)
        # Updating with another sketch is the operation under test.
        self.cms.update(other)

        observed = {(key, self.cms[key]) for key in expected}
        self.assertEqual(observed, set(expected.items()))
Example #8
0
 def setUp(self):
     """Create a fresh sketch (size argument 1) with this case's ``log_counting`` setting."""
     fresh_sketch = CountMinSketch(1, log_counting=self.log_counting)
     self.cms = fresh_sketch
Example #9
0
class CountMinSketchUpdateCommonTest(unittest.TestCase):
    """
    Functional tests for CountMinSketch.update method, which adds another counter, dictionary, hashtable, tuple or list
    """

    def __init__(self, methodName='runTest', log_counting=None):
        """
        Create a test case bound to one ``log_counting`` setting.

        :param methodName: name of the test method to run, forwarded to unittest.TestCase
        :param log_counting: value passed as ``log_counting`` to every CountMinSketch the tests build
        """
        self.log_counting = log_counting
        super(CountMinSketchUpdateCommonTest,
              self).__init__(methodName=methodName)

    def setUp(self):
        """Create a fresh sketch (size argument 1) for every test."""
        self.cms = CountMinSketch(1, log_counting=self.log_counting)

    def test_update_numbers(self):
        """
        Negative test: calling update using numeric values as parameter yields TypeError
        """
        with self.assertRaises(TypeError):
            self.cms.update(1)

        with self.assertRaises(TypeError):
            self.cms.update(1.0)

    def test_update_string(self):
        """Updating with a string counts its individual characters."""
        self.cms.update("foo")
        self.assertEqual(self.cms['f'], 1)
        self.assertEqual(self.cms['o'], 2)

    def test_update_tuple(self):
        """Updating with a tuple counts each element once per occurrence."""
        items = ('foo', 'bar', 'foo')
        self.cms.update(items)
        self.assertEqual(self.cms['foo'], 2)
        self.assertEqual(self.cms['bar'], 1)

    def test_update_bytes(self):
        """Bytes elements are readable through both the str and the bytes key."""
        items = (b'foo', b'bar', b'foo')
        self.cms.update(items)
        self.assertEqual(self.cms['foo'], 2)
        self.assertEqual(self.cms[b'foo'], 2)
        self.assertEqual(self.cms['bar'], 1)

    def test_update_unicode(self):
        """Plain and u-prefixed literals of the same text share one counter."""
        items = ('foo', 'bar', u'foo')
        self.cms.update(items)
        self.assertEqual(self.cms['foo'], 2)
        self.assertEqual(self.cms[u'foo'], 2)

    def test_update_list(self):
        """Updating with a list counts each element once per occurrence."""
        # Produces ['0', '1', '2', '0', '1'].
        self.cms.update([str(i % 3) for i in range(5)])
        self.assertEqual(self.cms['0'], 2)
        self.assertEqual(self.cms['1'], 2)
        self.assertEqual(self.cms['2'], 1)

    def test_update_split(self):
        """Words from a split sentence are counted case-sensitively."""
        self.cms.update("This is a sentence".split())
        self.assertEqual(self.cms['is'], 1)
        self.assertEqual(self.cms['this'], 0)  # lowercase

    def test_update_twice(self):
        """A second update adds to the counts left by the first one."""
        items = ('foo', 'bar', 'foo')
        self.cms.update(items)
        self.cms.update(('foo', 'bar', 'foo'))
        self.assertEqual(self.cms['foo'], 4)
        self.assertEqual(self.cms['bar'], 2)

    def test_update_with_dictionary(self):
        """
        Update with a dictionary and test against it using set representation
        """
        data = {'a': 1, 'b': 3, 'c': 2, 'd': 5}

        self.cms.update(data)

        self.assertEqual(self.cms['b'], 3)

        observed = {(key, self.cms[key]) for key in data}
        self.assertEqual(observed, set(data.items()))

    def test_update_with_cms(self):
        """
        Update with another sketch and test against the summed counts using set representation
        The log variants are only precise up to 2048 (16), so we don't use larger values here
        """
        data1 = {'a': 1, 'b': 3, 'c': 2, 'd': 5}
        data2 = {'a': 15, 'b': 4, 'c': 6, 'e': 13}
        expected = {'a': 16, 'b': 7, 'c': 8, 'd': 5, 'e': 13}

        self.cms.update(data1)
        cms2 = CountMinSketch(1, log_counting=self.log_counting)
        cms2.update(data2)
        self.cms.update(cms2)

        observed = {(key, self.cms[key]) for key in expected}
        self.assertEqual(observed, set(expected.items()))
Example #10
0
class CountMinSketchSanityCommonTest(unittest.TestCase):
    """
    Functional tests for setting and retrieving values of the counter
    """
    def __init__(self, methodName='runTest', log_counting=None, delta=0.0):
        """
        Create a test case bound to one ``log_counting`` setting.

        :param methodName: name of the test method to run, forwarded to unittest.TestCase
        :param log_counting: value passed as ``log_counting`` to the CountMinSketch built in setUp
        :param delta: relative tolerance; the approximate assertions allow ``delta * expected``
        """
        self.log_counting = log_counting
        self.delta = delta
        super(CountMinSketchSanityCommonTest,
              self).__init__(methodName=methodName)

    def setUp(self):
        """Create a fresh sketch (size argument 1) for every test."""
        self.cms = CountMinSketch(1, log_counting=self.log_counting)

    def test_unknown_is_zero(self):
        """A key that was never incremented reads as 0."""
        self.assertEqual(self.cms['foo'], 0)

    def test_increment_default(self):
        """increment() without a value adds one per call."""
        self.cms.increment('foo')
        self.cms.increment('bar')
        self.cms.increment('foo')
        self.cms.increment('foo')

        self.assertEqual(self.cms['foo'], 3)
        self.assertEqual(self.cms['bar'], 1)

    def test_increment_bytes(self):
        """The str and bytes forms of the same key share one counter."""
        self.cms.increment('foo')
        self.cms.increment('bar')
        self.cms.increment(b'foo')
        self.cms.increment('foo')

        self.assertEqual(self.cms['foo'], 3)
        self.assertEqual(self.cms[b'foo'], 3)

    def test_total(self):
        """total() reports the sum of all increments."""
        self.assertEqual(self.cms.total(), 0)

        self.cms.increment('foo')
        self.cms.increment('bar')
        self.cms.increment('foo')
        self.cms.increment('foo')
        self.assertEqual(self.cms.total(), 4)

        self.cms.increment('goo', 3)
        self.assertEqual(self.cms.total(), 7)

    def test_cardinality(self):
        """cardinality() reports the number of distinct keys incremented."""
        self.assertEqual(self.cms.cardinality(), 0)

        self.cms.increment('foo')
        self.cms.increment('bar')
        self.cms.increment('foo')
        self.cms.increment('foo')
        self.assertEqual(self.cms.cardinality(), 2)

        self.cms.increment('goo', 3)
        self.assertEqual(self.cms.cardinality(), 3)

    def test_increment_by_value(self):
        """Incrementing by an explicit amount is read back within the relative tolerance."""
        foo_value = 42
        bar_value = 53

        self.cms.increment('foo', foo_value)
        self.cms.increment('bar', bar_value)

        self.assertAlmostEqual(self.cms['foo'],
                               foo_value,
                               delta=self.delta * foo_value)
        self.assertAlmostEqual(self.cms['bar'],
                               bar_value,
                               delta=self.delta * bar_value)

    def test_repeat_increment(self):
        """
        Test that repeated increments of the same key add up
        """

        self.cms.increment('foo', 5)
        self.cms.increment('foo', 10)

        self.assertEqual(self.cms['foo'], 15)

    def test_increment_int_key(self):
        """
        Negative test: integer keys are not supported and yield TypeError
        """
        with self.assertRaises(TypeError):
            self.cms.increment(1)

    def test_get_increment_object_key(self):
        """
        Negative test: object keys are not supported and yield TypeError
        """
        # Defined locally, as in test_set_object_value, so this test does not
        # depend on a helper class declared elsewhere in the module.
        class MyClass(object):
            pass

        o = MyClass()

        with self.assertRaises(TypeError):
            self.cms.increment(o)

    def test_get_increment_empty_string(self):
        """The empty string is a valid key with its own counter."""
        self.cms.increment('foo', 42)
        self.cms.increment('bar', 53)

        self.assertEqual(self.cms[''], 0)
        self.cms.increment('', 3)
        self.assertEqual(self.cms[''], 3)
        self.cms.increment('')
        self.assertEqual(self.cms[''], 4)

    def test_get_increment_long_string(self):
        """Long keys that differ only in length are counted separately."""
        long_string = 'l' + ('o' * 100) + 'ng'
        longer_string = 'l' + ('o' * 120) + 'ng'
        self.cms.increment(long_string, 2)
        self.cms.increment(longer_string, 3)

        self.assertEqual(self.cms[long_string], 2)
        self.assertEqual(self.cms[longer_string], 3)

    def test_get_increment_non_ascii_string(self):
        """Non-ASCII keys that differ in a single symbol are counted separately."""
        non_ascii_string = "Non-ascii dôverivá Čučoriedka 9#8\\%7 平仮名\n☃\t+☀\t=\t☹ "
        # the second line contains a different symbol
        similar_string = "Non-ascii dôverivá Čučoriedka 9#8\\%7 平仮名\n☃\t+☀\t=\t☺ "

        self.cms.increment(non_ascii_string, 2)
        self.cms.increment(similar_string, 3)

        self.assertEqual(self.cms[non_ascii_string], 2)
        self.assertEqual(self.cms[similar_string], 3)

    def test_get_increment_non_ascii_unicode(self):
        """Same as the non-ASCII string test, using u-prefixed literals."""
        non_ascii_unicode = u"Non-ascii dôverivá Čučoriedka 9#8\\%7 平仮名\n☃\t+☀\t=\t☹ "
        # the second line contains a different symbol
        similar_unicode = u"Non-ascii dôverivá Čučoriedka 9#8\\%7 平仮名\n☃\t+☀\t=\t☺ "

        self.cms.increment(non_ascii_unicode, 2)
        self.cms.increment(similar_unicode, 3)

        self.assertEqual(self.cms[non_ascii_unicode], 2)
        self.assertEqual(self.cms[similar_unicode], 3)

    def test_increment_string_value(self):
        """
        Negative test: string values are not supported and yield TypeError
        """
        with self.assertRaises(TypeError):
            self.cms.increment('foo', 'bar')

    def test_set_object_value(self):
        """
        Negative test: object values are not supported and yield TypeError
        """
        class MyClass(object):
            pass

        with self.assertRaises(TypeError):
            self.cms.increment('foo', MyClass())

    def test_increment_big_number(self):
        """A large increment is read back within the relative tolerance."""
        big_number = 127451
        self.cms.increment('big number', big_number)
        self.assertAlmostEqual(self.cms['big number'],
                               big_number,
                               delta=self.delta * big_number)

    def test_increment_negative(self):
        """
        Negative test, raises ValueError on negative values
        """
        # new value
        with self.assertRaises(ValueError):
            self.cms.increment('foo', -4)

        self.assertEqual(self.cms['foo'], 0, "value should remain unaffected")

        self.cms.increment('foo', 3)
        # existing value
        with self.assertRaises(ValueError):
            self.cms.increment('foo', -2)

        self.assertEqual(self.cms['foo'], 3, "value should remain unaffected")

    def test_increment_zero(self):
        """
        Incrementing by zero leaves the stored count unchanged
        """
        self.cms.increment('foo', 0)
        self.assertEqual(self.cms['foo'], 0)

        self.cms.increment('foo')
        self.cms.increment('foo', 0)
        self.assertEqual(self.cms['foo'], 1)
Example #11
0
 def test_largest_cms(self):
     """Constructing a sketch with ``size_mb=16384`` and ``log_counting=8`` must not raise."""
     # Only construction is exercised; the instance itself is not inspected.
     _largest = CountMinSketch(log_counting=8, size_mb=16384)
Example #12
0
 def test_invalid_sizemb(self):
     """Negative test: a size argument of 0.5 must raise ValueError."""
     self.assertRaises(ValueError, CountMinSketch, 0.5)
Example #13
0
 def test_invalid_algorithm(self):
     """Negative test: each listed ``log_counting`` value must raise ValueError."""
     rejected_values = ('basic', 'log8', 'cons', 'logcounter', 5)
     for candidate in rejected_values:
         self.assertRaises(ValueError, CountMinSketch, 1, log_counting=candidate)
Example #14
0
class CountMinSketchPickleCommonTest(unittest.TestCase):
    """
    Functional tests for pickling a CountMinSketch to a file and loading it back.
    """

    def __init__(self, methodName='runTest', log_counting=None):
        """
        Create a test case bound to one ``log_counting`` setting.

        :param methodName: name of the test method to run, forwarded to unittest.TestCase
        :param log_counting: value passed as ``log_counting`` to the CountMinSketch built in setUp
        """
        self.log_counting = log_counting
        super(CountMinSketchPickleCommonTest, self).__init__(methodName=methodName)

    def setUp(self):
        """Create a fresh sketch (size argument 2) for every test."""
        self.cms = CountMinSketch(2, log_counting=self.log_counting)

    def tearDown(self):
        """Remove the pickle file, if a test created one."""
        if os.path.isfile(filename):
            os.remove(filename)

    def store_and_load(self):
        """
        Pickle ``self.cms`` into ``filename`` and return the object loaded back from it.
        """
        with open(filename, 'wb') as outfile:
            pickle.dump(self.cms, outfile)

        # The file read here is the one written just above.
        with open(filename, 'rb') as infile:
            reloaded = pickle.load(infile)

        return reloaded

    def check_cms(self, cms, data):
        """
        Assert that ``cms`` agrees with the exact counts in ``data``.

        :param cms: the sketch to inspect
        :param data: mapping of key to expected count
        """
        self.assertAlmostEqual(cms.cardinality(), len(data))
        self.assertEqual(cms.total(), sum(data.values()))

        observed = {(key, cms[key]) for key in data}
        self.assertEqual(observed, set(data.items()))

    def test_pickle_empty(self):
        """An empty sketch is still empty after a pickle round trip."""
        reloaded = self.store_and_load()
        self.check_cms(reloaded, {})

    def test_pickle_simple(self):
        """Counts are the same before and after a pickle round trip."""
        expected = Counter()
        # Feed identical input to the sketch and to the exact reference counter.
        for structure in [self.cms, expected]:
            structure.update("pickling")
            structure.update("lorem ipsum dolor amet")
            structure.update("122333444455555666666")

        self.check_cms(self.cms, expected)

        reloaded = self.store_and_load()
        self.check_cms(reloaded, expected)

    def test_pickle_increment_after_reload(self):
        """A reloaded sketch keeps accepting updates and increments."""
        expected = Counter()
        for structure in [self.cms, expected]:
            structure.update("pickling")
        self.cms.increment('1')
        self.cms.increment('2', 2)
        expected['1'] += 1
        expected['2'] += 2

        self.check_cms(self.cms, expected)

        reloaded = self.store_and_load()

        # Continue counting on the reloaded copy, mirroring each step in `expected`.
        for structure in [reloaded, expected]:
            structure.update("pickling")
        reloaded.increment('1', 1)
        reloaded.increment('3', 3)
        expected['1'] += 1
        expected['3'] += 3
        self.check_cms(reloaded, expected)