Esempio n. 1
0
    def test_vector_sketch(self):
        """Check sketch summaries of an SArray built from lists of integers.

        Covers the top-level sketch, the element-length summary, the summary
        over all flattened elements, frequent items, and sub sketches
        requested by a single index, a list of indexes, and a range.
        """
        vector_data = [[], [1, 2], [3], [4, 5, 6, 7], [8, 9, 10], None]
        sa = SArray(data=vector_data)

        sketch = sa.sketch_summary()
        self.__validate_sketch_result(sketch, sa)
        self.__validate_sketch_result(sketch.element_length_summary(),
                                      sa.dropna().item_length())

        # The element summary is compared against every entry of the
        # non-missing rows flattened into one array.
        flattened = list(itertools.chain.from_iterable(list(sa.dropna())))
        self.__validate_sketch_result(sketch.element_summary(),
                                      SArray(flattened))

        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 5)
        self.assertEqual(fi['[1 2]'], 1)
        self.assertEqual(fi['[4 5 6 7]'], 1)

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys=1).element_sub_sketch(1)
        expected = sa.vector_slice(1)
        self.__validate_sketch_result(s, expected)

        # sub sketch with multiple keys
        keys = [1, 3]
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # Membership test instead of has_key(), which Python 3 dicts no
            # longer provide (assumes the result supports `in` like a dict).
            self.assertIn(key, s)
            expected = sa.vector_slice(key)
            self.__validate_sketch_result(s[key], expected)

        # Calling element_sub_sketch() without keys should return one entry
        # per requested index.
        indexes = range(0, 10)
        s = sa.sketch_summary(sub_sketch_keys=indexes).element_sub_sketch()
        self.assertEqual(len(s), len(indexes))
Esempio n. 2
0
    def test_vector_sketch(self):
        """Exercise the sketch summary of an SArray of integer lists.

        Validates the overall sketch, the per-row length summary, the
        flattened element summary, the frequent-item counts, and sub
        sketches selected by one index, several indexes, or a range.
        """
        vector_data = [[], [1, 2], [3], [4, 5, 6, 7], [8, 9, 10], None]
        sa = SArray(data=vector_data)

        sketch = sa.sketch_summary()
        self.__validate_sketch_result(sketch, sa)
        self.__validate_sketch_result(sketch.element_length_summary(),
                                      sa.dropna().item_length())

        # Reference data for the element summary: all values from the
        # non-missing rows, chained into a single flat list.
        flattened = list(itertools.chain.from_iterable(list(sa.dropna())))
        self.__validate_sketch_result(sketch.element_summary(),
                                      SArray(flattened))

        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 5)
        self.assertEqual(fi['[1 2]'], 1)
        self.assertEqual(fi['[4 5 6 7]'], 1)

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys=1).element_sub_sketch(1)
        expected = sa.vector_slice(1)
        self.__validate_sketch_result(s, expected)

        # sub sketch with multiple keys
        keys = [1, 3]
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # has_key() was removed from dict in Python 3; assertIn works on
            # both major versions (assumes a dict-like return value).
            self.assertIn(key, s)
            expected = sa.vector_slice(key)
            self.__validate_sketch_result(s[key], expected)

        # No explicit keys: expect one sub sketch per requested index.
        indexes = range(0, 10)
        s = sa.sketch_summary(sub_sketch_keys=indexes).element_sub_sketch()
        self.assertEqual(len(s), len(indexes))
Esempio n. 3
0
    def test_dict_sketch_str_value(self):
        """Check sketch summaries of a dict SArray whose values are strings.

        Validates frequent items, the key and value summaries, and sub
        sketches requested for an existing key, a missing key, and a list of
        keys (retrieved both explicitly and with no keys given).
        """
        # Dict value sketch type should be auto inferred
        dict_data = [
            {'a': 'b', 'b': 'c'},
            {'a': 'b', 'b': 'c'},
            {'a': 'd', 'b': '4'},
            None,
        ]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 2)
        self.assertEqual(fi['{"a":"b", "b":"c"}'], 2)
        self.assertEqual(fi['{"a":"d", "b":"4"}'], 1)

        # Get dict key sketch
        key_summary = sketch.dict_key_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(another_rep))

        # Get dict value sketch
        value_summary = sketch.dict_value_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(another_rep))

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        expected = sa.unpack(column_name_prefix="")['a']
        self.__validate_sketch_result(s, expected)

        # A key that appears in no row should be undefined for every row.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # assertIn replaces has_key(), which Python 3 dicts no longer
            # provide (assumes the result supports `in` like a dict).
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)

        # allow pass in empty keys, which will retrieve all keys
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch()
        self.assertEqual(len(s), len(keys))
        for key in keys:
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)
Esempio n. 4
0
    def test_dict_sketch_int_value(self):
        """Check sketch summaries of a dict SArray whose values are integers.

        Validates the unique count, frequent items, the key and value
        summaries, and sub sketches requested for an existing key, a missing
        key, and a list of keys.
        """
        dict_data = [
            {},
            {'a': 1, 'b': 2},
            {'a': 1, 'b': 2},
            {'a': 3, 'c': 1},
            {'a': 1, 'b': 2, 'c': 3},
            None,
        ]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        self.assertEqual(sketch.num_unique(), 4)
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 4)
        self.assertEqual(fi['{"a":1, "b":2}'], 2)
        self.assertEqual(fi['{"a":3, "c":1}'], 1)

        # Get dict key sketch
        key_summary = sketch.dict_key_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(another_rep))

        # Get dict value sketch
        value_summary = sketch.dict_value_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(another_rep))

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        expected = sa.unpack(column_name_prefix="")['a']
        self.__validate_sketch_result(s, expected)

        # A key that appears in no row should be undefined for every row.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # assertIn replaces has_key(), which Python 3 dicts no longer
            # provide (assumes the result supports `in` like a dict).
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)
Esempio n. 5
0
    def test_str_sketch(self):
        """Check which sketch queries a string SArray supports.

        The numeric statistics and quantile() are expected to raise
        RuntimeError, while the counting queries must still work.
        """
        str_data = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", None]
        sa = SArray(data=str_data)
        sketch = sa.sketch_summary()

        # Each numeric statistic must be rejected, checked in the same order
        # as before: min, max, sum, mean, var, std.
        for stat_name in ("min", "max", "sum", "mean", "var", "std"):
            with self.assertRaises(RuntimeError):
                getattr(sketch, stat_name)()

        # The unique count is only checked to within +/- 3 of the true value.
        self.assertAlmostEqual(sketch.num_unique(), 10, delta=3)
        self.assertEqual(sketch.num_undefined(), 1)
        self.assertEqual(sketch.size(), len(str_data))

        with self.assertRaises(RuntimeError):
            sketch.quantile(0.5)
        for value in ("1", "2"):
            self.assertEqual(sketch.frequency_count(value), 1)
        frequent = sketch.frequent_items()
        self.assertEqual(len(frequent), 10)
Esempio n. 6
0
 def test_background_sketch(self):
     """Request sub sketches from a sketch that is built in the background.

     Builds 9999 single-key dict rows, asks for sub sketches on keys
     "100".."199", then fetches five of them and checks the count.
     """
     rows = [{str(i): 1} for i in range(1, 10000)]
     sa = SArray(rows)
     requested_keys = [str(i) for i in range(100, 200)]
     background_sketch = sa.sketch_summary(background=True,
                                           sub_sketch_keys=requested_keys)
     # cannot check the actual value as it depends on the speed of processing
     background_sketch.sketch_ready()
     probe_keys = [str(i) for i in range(100, 105)]
     sub_sketches = background_sketch.element_sub_sketch(probe_keys)
     self.assertEqual(len(sub_sketches), 5)
Esempio n. 7
0
    def test_str_sketch(self):
        """Verify sketch behaviour for an SArray of strings.

        Numeric statistics and quantiles must raise RuntimeError; unique,
        undefined, size, and frequency queries must return values.
        """
        str_data = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", None]
        sa = SArray(data=str_data)
        sketch = sa.sketch_summary()

        # Callable form of assertRaises: the method is invoked by the
        # assertion itself, one statistic at a time.
        self.assertRaises(RuntimeError, sketch.min)
        self.assertRaises(RuntimeError, sketch.max)
        self.assertRaises(RuntimeError, sketch.sum)
        self.assertRaises(RuntimeError, sketch.mean)
        self.assertRaises(RuntimeError, sketch.var)
        self.assertRaises(RuntimeError, sketch.std)

        # The unique count is only checked to within +/- 3 of the true value.
        unique_estimate = sketch.num_unique()
        self.assertAlmostEqual(unique_estimate, 10, delta=3)
        self.assertEqual(sketch.num_undefined(), 1)
        self.assertEqual(sketch.size(), len(str_data))

        self.assertRaises(RuntimeError, sketch.quantile, 0.5)
        self.assertEqual(sketch.frequency_count("1"), 1)
        self.assertEqual(sketch.frequency_count("2"), 1)
        frequent = sketch.frequent_items()
        self.assertEqual(len(frequent), 10)
Esempio n. 8
0
    def test_dict_sketch_str_value(self):
        """Exercise the sketch summary of a dict SArray with string values.

        Checks frequent items, key/value summaries, and sub sketches for a
        present key, an absent key, and a key list (fetched with and without
        naming the keys).
        """
        # Dict value sketch type should be auto inferred
        dict_data = [{'a': 'b', 'b': 'c'}, {'a': 'b', 'b': 'c'},
                     {'a': 'd', 'b': '4'}, None]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 2)
        self.assertEqual(fi['{"a":"b", "b":"c"}'], 2)
        self.assertEqual(fi['{"a":"d", "b":"4"}'], 1)

        # Get dict key sketch
        key_summary = sketch.dict_key_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(another_rep))

        # Get dict value sketch
        value_summary = sketch.dict_value_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(another_rep))

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        expected = sa.unpack(column_name_prefix="")['a']
        self.__validate_sketch_result(s, expected)

        # A key present in no row should be undefined for every row.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # has_key() was removed from dict in Python 3; assertIn works on
            # both major versions (assumes a dict-like return value).
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)

        # allow pass in empty keys, which will retrieve all keys
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch()
        self.assertEqual(len(s), len(keys))
        for key in keys:
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)
Esempio n. 9
0
 def test_background_sketch(self):
     """Fetch sub sketches from a sketch computed in the background.

     The array holds 9999 one-entry dicts keyed "1".."9999"; sub sketches
     are requested for keys "100".."199" and five of them are retrieved.
     """
     dict_rows = [{str(n): 1} for n in range(1, 10000)]
     sa = SArray(dict_rows)
     wanted = list(map(str, range(100, 200)))
     sketch = sa.sketch_summary(background=True, sub_sketch_keys=wanted)
     # cannot check the actual value as it depends on the speed of processing
     sketch.sketch_ready()
     first_five = list(map(str, range(100, 105)))
     fetched = sketch.element_sub_sketch(first_five)
     self.assertEqual(len(fetched), 5)
Esempio n. 10
0
    def test_dict_sketch_int_value(self):
        """Exercise the sketch summary of a dict SArray with integer values.

        Checks the unique count, frequent items, key/value summaries, and
        sub sketches for a present key, an absent key, and a key list.
        """
        dict_data = [{}, {'a': 1, 'b': 2}, {'a': 1, 'b': 2},
                     {'a': 3, 'c': 1}, {'a': 1, 'b': 2, 'c': 3}, None]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        self.assertEqual(sketch.num_unique(), 4)
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 4)
        self.assertEqual(fi['{"a":1, "b":2}'], 2)
        self.assertEqual(fi['{"a":3, "c":1}'], 1)

        # Get dict key sketch
        key_summary = sketch.dict_key_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(another_rep))

        # Get dict value sketch
        value_summary = sketch.dict_value_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(another_rep))

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        expected = sa.unpack(column_name_prefix="")['a']
        self.__validate_sketch_result(s, expected)

        # A key present in no row should be undefined for every row.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # has_key() was removed from dict in Python 3; assertIn works on
            # both major versions (assumes a dict-like return value).
            self.assertIn(key, s)
            expected = sa.unpack(column_name_prefix="")[key]
            self.__validate_sketch_result(s[key], expected)
Esempio n. 11
0
    def test_list_sketch(self):
        """Check the sketch summary of an SArray of mixed-content lists.

        Validates the nested sketch, the unique-row count, the element
        summary against the flattened elements cast to str, and the
        frequent-item counts.
        """
        rows = [[], [1, 2], [1, 2], ['a', 'a', 'a', 'b'], [1, 1, 2], None]
        sa = SArray(rows)
        self.__validate_nested_sketch_result(sa)
        sketch = sa.sketch_summary()

        self.assertEqual(sketch.num_unique(), 4)
        element_sketch = sketch.element_summary()
        # Flatten the non-missing rows; the reference array is built as str.
        non_missing_rows = list(sa.dropna())
        flat_elements = list(itertools.chain.from_iterable(non_missing_rows))
        reference = SArray(flat_elements, str)
        self.__validate_sketch_result(element_sketch, reference)

        frequent = sketch.frequent_items()
        self.assertEqual(len(frequent), 4)
        self.assertEqual(frequent['[1,2]'], 2)
        self.assertEqual(frequent['["a","a","a","b"]'], 1)
Esempio n. 12
0
    def test_list_sketch(self):
        """Verify sketch results for an SArray whose rows are lists.

        The rows mix integers and strings; the element summary is compared
        with a str-typed SArray of all flattened elements.
        """
        source_rows = [
            [],
            [1, 2],
            [1, 2],
            ['a', 'a', 'a', 'b'],
            [1, 1, 2],
            None,
        ]
        sa = SArray(source_rows)
        self.__validate_nested_sketch_result(sa)
        summary = sa.sketch_summary()

        self.assertEqual(summary.num_unique(), 4)
        elements = summary.element_summary()
        # Chain the non-missing rows into one flat list of elements.
        flat = list(itertools.chain.from_iterable(list(sa.dropna())))
        self.__validate_sketch_result(elements, SArray(flat, str))

        counts = summary.frequent_items()
        self.assertEqual(len(counts), 4)
        self.assertEqual(counts['[1,2]'], 2)
        self.assertEqual(counts['["a","a","a","b"]'], 1)
Esempio n. 13
0
    def test_empty_sketch(self):
        """Check the sketch summary of an empty SArray.

        min/max are expected to be NaN, the other statistics and counts to
        be zero, quantile() to raise RuntimeError, and frequent_items() to
        be empty.
        """
        empty_data = []
        sa = SArray(data=empty_data)
        sketch = sa.sketch_summary()
        # NaN never compares equal to anything, so use math.isnan here.
        self.assertTrue(math.isnan(sketch.min()))
        self.assertTrue(math.isnan(sketch.max()))
        # assertEqual, not the deprecated alias assertEquals (removed in
        # Python 3.12).
        self.assertEqual(sketch.sum(), 0)
        self.assertEqual(sketch.mean(), 0)
        self.assertEqual(sketch.var(), 0)
        self.assertEqual(sketch.std(), 0)
        self.assertEqual(sketch.num_unique(), 0)
        self.assertEqual(sketch.num_undefined(), 0)
        self.assertEqual(sketch.size(), 0)
        with self.assertRaises(RuntimeError):
            sketch.quantile(0.5)

        t = sketch.frequent_items()
        self.assertEqual(len(t), 0)
Esempio n. 14
0
    def test_empty_sketch(self):
        """Verify sketch results for an SArray created from an empty list.

        Expects NaN for min and max, zero for sum/mean/var/std and for the
        unique, undefined, and size counts, a RuntimeError from quantile(),
        and no frequent items.
        """
        int_data = []
        sa = SArray(data=int_data)
        sketch = sa.sketch_summary()
        # NaN is not equal to itself, so equality assertions cannot be used.
        self.assertTrue(math.isnan(sketch.min()))
        self.assertTrue(math.isnan(sketch.max()))
        # Use assertEqual: the assertEquals alias is deprecated and was
        # removed in Python 3.12.
        self.assertEqual(sketch.sum(), 0)
        self.assertEqual(sketch.mean(), 0)
        self.assertEqual(sketch.var(), 0)
        self.assertEqual(sketch.std(), 0)
        self.assertEqual(sketch.num_unique(), 0)
        self.assertEqual(sketch.num_undefined(), 0)
        self.assertEqual(sketch.size(), 0)
        with self.assertRaises(RuntimeError):
            sketch.quantile(0.5)

        t = sketch.frequent_items()
        self.assertEqual(len(t), 0)
Esempio n. 15
0
 def test_sketch_float(self):
     """Validate the sketch of mixed float/int values with a missing entry."""
     float_values = [1.2, 3, 0.4, 6.789, None]
     sa = SArray(data=float_values)
     summary = sa.sketch_summary()
     self.__validate_sketch_result(summary, sa)
Esempio n. 16
0
 def test_sketch_int(self):
     """Validate the sketch of the integers 1..10 followed by a missing value."""
     values = list(range(1, 11)) + [None]
     sa = SArray(data=values)
     summary = sa.sketch_summary()
     self.__validate_sketch_result(summary, sa)
Esempio n. 17
0
 def test_cancelation(self):
     """Start a background sketch and cancel it.

     There are no assertions; the test passes when neither call raises.
     """
     values = SArray(range(1, 10000))
     background_sketch = values.sketch_summary(background=True)
     background_sketch.cancel()
Esempio n. 18
0
 def test_large_value_sketch(self):
     """Validate the sketch of 100 copies of a large integer (1234567890)."""
     sa = SArray([1234567890] * 100)
     # Build the sketch once and validate that same object; previously `sk`
     # was assigned but unused and a second sketch was computed.
     sk = sa.sketch_summary()
     # NOTE(review): 1E-5 is presumably a comparison tolerance -- confirm
     # against __validate_sketch_result.
     self.__validate_sketch_result(sk, sa, 1E-5)
Esempio n. 19
0
 def test_large_value_sketch(self):
     """Check sketch validation on an SArray of 100 identical large integers."""
     sa = SArray([1234567890 for _ in range(100)])
     # Reuse the sketch instead of discarding it and computing a second one.
     sk = sa.sketch_summary()
     # NOTE(review): the third argument (1E-5) looks like a tolerance for
     # the comparison -- confirm against __validate_sketch_result.
     self.__validate_sketch_result(sk, sa, 1E-5)
Esempio n. 20
0
 def test_cancelation(self):
     """Cancel a sketch that was requested with background=True.

     Nothing is asserted; the test only requires that cancel() not raise.
     """
     source = SArray(range(1, 10000))
     pending = source.sketch_summary(background=True)
     pending.cancel()
Esempio n. 21
0
 def test_sketch_int(self):
     """Check the sketch summary of ten integers plus one missing value."""
     numbers = [n for n in range(1, 11)]
     numbers.append(None)
     sa = SArray(data=numbers)
     self.__validate_sketch_result(sa.sketch_summary(), sa)
Esempio n. 22
0
 def test_sketch_float(self):
     """Check the sketch summary of float/int values that include None."""
     samples = [1.2, 3, 0.4, 6.789, None]
     sa = SArray(data=samples)
     sketch = sa.sketch_summary()
     self.__validate_sketch_result(sketch, sa)