Esempio n. 1
0
    def test_dict_sketch_str_value(self):
        """Check sketch summaries of a dict SArray whose values are strings.

        Covers the top-level sketch (frequent items), the dict key and dict
        value sketches, and per-key sub sketches requested with a single
        key, a key that never occurs, an explicit key list, and no key list.
        """
        # Dict value sketch type should be auto inferred
        dict_data = [
            {'a': 'b', 'b': 'c'},
            {'a': 'b', 'b': 'c'},
            {'a': 'd', 'b': '4'},
            None,
        ]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 2)
        self.assertEqual(fi['{"a":"b", "b":"c"}'], 2)
        self.assertEqual(fi['{"a":"d", "b":"4"}'], 1)

        # Get dict key sketch and compare it with a sketch over the
        # flattened list of all keys.
        key_summary = sketch.dict_key_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(another_rep))

        # Get dict value sketch and compare it with a sketch over the
        # flattened list of all values.
        value_summary = sketch.dict_value_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(another_rep))

        # The unpacked columns are the reference for every per-key check
        # below, so unpack once instead of once per key.
        unpacked = sa.unpack(column_name_prefix="")

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        self.__validate_sketch_result(s, unpacked['a'])

        # A key that appears in no element is expected to be undefined
        # for every row.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # dict.has_key() no longer exists in Python 3; `in` works on
            # both Python 2 and 3.
            self.assertIn(key, s)
            self.__validate_sketch_result(s[key], unpacked[key])

        # allow pass in empty keys, which will retrieve all keys
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch()
        self.assertEqual(len(s), len(keys))
        for key in keys:
            self.assertIn(key, s)
            self.__validate_sketch_result(s[key], unpacked[key])
Esempio n. 2
0
    def test_dict_sketch_int_value(self):
        """Check sketch summaries of a dict SArray whose values are ints.

        Covers the top-level sketch (unique count and frequent items), the
        dict key and dict value sketches, and per-key sub sketches requested
        with a single key, a key that never occurs, and an explicit key list.
        """
        # Four distinct dicts (one of them empty) plus a missing value.
        dict_data = [
            {},
            {'a': 1, 'b': 2},
            {'a': 1, 'b': 2},
            {'a': 3, 'c': 1},
            {'a': 1, 'b': 2, 'c': 3},
            None,
        ]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        self.assertEqual(sketch.num_unique(), 4)
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 4)
        self.assertEqual(fi['{"a":1, "b":2}'], 2)
        self.assertEqual(fi['{"a":3, "c":1}'], 1)

        # Get dict key sketch and compare it with a sketch over the
        # flattened list of all keys.
        key_summary = sketch.dict_key_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(another_rep))

        # Get dict value sketch and compare it with a sketch over the
        # flattened list of all values.
        value_summary = sketch.dict_value_summary()
        another_rep = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(another_rep))

        # The unpacked columns are the reference for every per-key check
        # below, so unpack once instead of once per key.
        unpacked = sa.unpack(column_name_prefix="")

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        self.__validate_sketch_result(s, unpacked['a'])

        # A key that appears in no element is expected to be undefined
        # for every row.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # dict.has_key() no longer exists in Python 3; `in` works on
            # both Python 2 and 3.
            self.assertIn(key, s)
            self.__validate_sketch_result(s[key], unpacked[key])
Esempio n. 3
0
    def test_dict_sketch_str_value(self):
        """Verify sketch results for an SArray of dicts with string values.

        Checks frequent items on the whole-array sketch, the key and value
        sketches, and the element sub sketches for one key, an absent key,
        a list of keys, and the default (no keys passed) lookup.
        """
        # Dict value sketch type should be auto inferred
        dict_data = [
            {'a': 'b', 'b': 'c'},
            {'a': 'b', 'b': 'c'},
            {'a': 'd', 'b': '4'},
            None,
        ]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 2)
        self.assertEqual(fi['{"a":"b", "b":"c"}'], 2)
        self.assertEqual(fi['{"a":"d", "b":"4"}'], 1)

        # Get dict key sketch; the reference is every key of every
        # non-missing element, flattened into one list.
        key_summary = sketch.dict_key_summary()
        flat_keys = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(flat_keys))

        # Get dict value sketch; the reference is built the same way from
        # the values.
        value_summary = sketch.dict_value_summary()
        flat_values = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(flat_values))

        # Per-key reference columns, computed once and reused below.
        unpacked = sa.unpack(column_name_prefix="")

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        self.__validate_sketch_result(s, unpacked['a'])

        # No element contains this key, so every row should count as
        # undefined.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # Use `in` for membership: has_key() was removed from dict in
            # Python 3.
            self.assertIn(key, s)
            self.__validate_sketch_result(s[key], unpacked[key])

        # allow pass in empty keys, which will retrieve all keys
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch()
        self.assertEqual(len(s), len(keys))
        for key in keys:
            self.assertIn(key, s)
            self.__validate_sketch_result(s[key], unpacked[key])
Esempio n. 4
0
    def test_dict_sketch_int_value(self):
        """Verify sketch results for an SArray of dicts with int values.

        Checks the unique count and frequent items on the whole-array
        sketch, the key and value sketches, and the element sub sketches
        for one key, an absent key, and a list of keys.
        """
        # Four distinct dicts (one of them empty) plus a missing value.
        dict_data = [
            {},
            {'a': 1, 'b': 2},
            {'a': 1, 'b': 2},
            {'a': 3, 'c': 1},
            {'a': 1, 'b': 2, 'c': 3},
            None,
        ]
        sa = SArray(data=dict_data)
        self.__validate_nested_sketch_result(sa)

        sketch = sa.sketch_summary()
        self.assertEqual(sketch.num_unique(), 4)
        fi = sketch.frequent_items()
        self.assertEqual(len(fi), 4)
        self.assertEqual(fi['{"a":1, "b":2}'], 2)
        self.assertEqual(fi['{"a":3, "c":1}'], 1)

        # Get dict key sketch; the reference is every key of every
        # non-missing element, flattened into one list.
        key_summary = sketch.dict_key_summary()
        flat_keys = list(
            itertools.chain.from_iterable(list(sa.dict_keys().dropna())))
        self.__validate_sketch_result(key_summary, SArray(flat_keys))

        # Get dict value sketch; the reference is built the same way from
        # the values.
        value_summary = sketch.dict_value_summary()
        flat_values = list(
            itertools.chain.from_iterable(list(sa.dict_values().dropna())))
        self.__validate_sketch_result(value_summary, SArray(flat_values))

        # Per-key reference columns, computed once and reused below.
        unpacked = sa.unpack(column_name_prefix="")

        # sub sketch with one key
        s = sa.sketch_summary(sub_sketch_keys='a').element_sub_sketch('a')
        self.__validate_sketch_result(s, unpacked['a'])

        # No element contains this key, so every row should count as
        # undefined.
        s = sa.sketch_summary(
            sub_sketch_keys='Nonexist').element_sub_sketch('Nonexist')
        self.assertEqual(s.num_undefined(), len(sa))

        # sub sketch with multiple keys
        keys = ['a', 'b']
        s = sa.sketch_summary(sub_sketch_keys=keys).element_sub_sketch(keys)
        self.assertEqual(len(s), len(keys))
        for key in keys:
            # Use `in` for membership: has_key() was removed from dict in
            # Python 3.
            self.assertIn(key, s)
            self.__validate_sketch_result(s[key], unpacked[key])