コード例 #1
0
ファイル: test_types.py プロジェクト: zjureel/flink
    def test_infer_nested_schema(self):
        """Rows containing arrays and maps infer nested data types."""
        NestedRow = Row("f1", "f2")

        # f1 is an array of bigints, f2 a map from varchar to double
        rows = [
            NestedRow([1, 2], {"row1": 1.0}),
            NestedRow([2, 3], {"row2": 2.0})
        ]
        inferred = _infer_schema_from_data(rows)
        self.assertEqual(
            ['ArrayType(BigIntType(true), true)',
             'MapType(VarCharType(2147483647, false), DoubleType(true), true)'],
            [repr(field.data_type) for field in inferred.fields])

        # f1 is an array of arrays, f2 a plain array of bigints
        rows = [
            NestedRow([[1, 2], [2, 3]], [1, 2]),
            NestedRow([[2, 3], [3, 4]], [2, 3])
        ]
        inferred = _infer_schema_from_data(rows)
        self.assertEqual(
            ['ArrayType(ArrayType(BigIntType(true), true), true)',
             'ArrayType(BigIntType(true), true)'],
            [repr(field.data_type) for field in inferred.fields])
コード例 #2
0
ファイル: test_types.py プロジェクト: apache/flink
    def test_infer_schema_nulltype(self):
        """Types are still inferred when the first row only has empty/None
        values — the second, populated row supplies the types."""
        rows = [Row(c1=[], c2={}, c3=None),
                Row(c1=[Row(a=1, b='s')], c2={"key": Row(c=1.0, d="2")}, c3="")]
        inferred = _infer_schema_from_data(rows)
        self.assertTrue(isinstance(inferred, RowType))
        self.assertEqual(3, len(inferred.fields))

        array_type = inferred.fields[0].data_type
        map_type = inferred.fields[1].data_type

        # c1: array whose elements are rows of (bigint, varchar)
        self.assertTrue(isinstance(array_type, ArrayType))
        self.assertTrue(isinstance(array_type.element_type, RowType))
        self.assertTrue(
            isinstance(array_type.element_type.fields[0].data_type, BigIntType))
        self.assertTrue(
            isinstance(array_type.element_type.fields[1].data_type, VarCharType))

        # c2: map from varchar keys to row values
        self.assertTrue(isinstance(map_type, MapType))
        self.assertTrue(isinstance(map_type.key_type, VarCharType))
        self.assertTrue(isinstance(map_type.value_type, RowType))

        # c3: varchar
        self.assertTrue(isinstance(inferred.fields[2].data_type, VarCharType))
コード例 #3
0
ファイル: test_types.py プロジェクト: zjureel/flink
    def test_infer_schema_nulltype(self):
        """Types must be inferred from the populated second row when the
        first row only carries empty/None values."""
        elements = [
            Row(c1=[], c2={}, c3=None),
            Row(c1=[Row(a=1, b='s')], c2={"key": Row(c=1.0, d="2")}, c3="")
        ]
        schema = _infer_schema_from_data(elements)
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)) which only prints "False is not true"
        self.assertIsInstance(schema, RowType)
        self.assertEqual(3, len(schema.fields))

        # first column is array
        self.assertIsInstance(schema.fields[0].data_type, ArrayType)

        # element type of first column is struct
        element_type = schema.fields[0].data_type.element_type
        self.assertIsInstance(element_type, RowType)
        self.assertIsInstance(element_type.fields[0].data_type, BigIntType)
        self.assertIsInstance(element_type.fields[1].data_type, VarCharType)

        # second column is map
        self.assertIsInstance(schema.fields[1].data_type, MapType)
        self.assertIsInstance(schema.fields[1].data_type.key_type, VarCharType)
        self.assertIsInstance(schema.fields[1].data_type.value_type, RowType)

        # third column is varchar
        self.assertIsInstance(schema.fields[2].data_type, VarCharType)
コード例 #4
0
    def from_elements(self, elements, schema=None, verify_schema=True):
        """
        Creates a table from a collection of elements.

        Example:
        ::

            >>> table_env.from_elements([(1, 'Hi'), (2, 'Hello')], ['a', 'b'])

        :param elements: The elements to create a table from.
        :param schema: The schema of the table: a :class:`RowType`, a single
                       :class:`DataType` (wrapped into a one-column row), a
                       list/tuple of field names, or None to infer the schema.
        :param verify_schema: Whether to verify the elements against the schema.
        :return: The result :class:`Table`.
        """

        # builds a verifier for the declared schema (a no-op when
        # verify_schema is False or when the schema is inferred below)
        if isinstance(schema, RowType):
            verify_func = _create_type_verifier(
                schema) if verify_schema else lambda _: True

            def verify_obj(obj):
                verify_func(obj)
                return obj
        elif isinstance(schema, DataType):
            # a single atomic type is wrapped into a one-field row schema
            data_type = schema
            schema = RowType().add("value", schema)

            verify_func = _create_type_verifier(
                data_type,
                name="field value") if verify_schema else lambda _: True

            def verify_obj(obj):
                verify_func(obj)
                return obj
        else:

            def verify_obj(obj):
                return obj

        # materializes one-shot iterables so they can be traversed twice
        if "__len__" not in dir(elements):
            elements = list(elements)

        # infers the schema if not specified
        if schema is None or isinstance(schema, (list, tuple)):
            names = schema  # user-supplied field names, if any
            schema = _infer_schema_from_data(elements, names=names)
            converter = _create_converter(schema)
            elements = map(converter, elements)
            # renames the inferred fields with the user-supplied names;
            # the original tested the already-reassigned `schema` here,
            # which made this branch unreachable
            if isinstance(names, (list, tuple)):
                for i, name in enumerate(names):
                    schema.fields[i].name = name
                    schema.names[i] = name

        elif not isinstance(schema, RowType):
            raise TypeError(
                "schema should be RowType, list, tuple or None, but got: %s" %
                schema)

        # verifies the original python data BEFORE conversion, so the
        # verifier sees the values the caller supplied; the original
        # verified after to_sql_type, i.e. it checked the converted SQL
        # representation against a verifier built for python values
        elements = map(verify_obj, elements)
        # converts python data to sql data
        elements = [schema.to_sql_type(element) for element in elements]
        return self._from_elements(elements, schema)
コード例 #5
0
ファイル: test_types.py プロジェクト: apache/flink
    def test_infer_nested_schema(self):
        """Rows holding arrays and maps infer the matching nested types."""
        NestedRow = Row("f1", "f2")

        # each case: (input rows, expected reprs of the field data types)
        cases = [
            ([NestedRow([1, 2], {"row1": 1.0}),
              NestedRow([2, 3], {"row2": 2.0})],
             ['ArrayType(BigIntType(true), true)',
              'MapType(VarCharType(2147483647, false), DoubleType(true), true)']),
            ([NestedRow([[1, 2], [2, 3]], [1, 2]),
              NestedRow([[2, 3], [3, 4]], [2, 3])],
             ['ArrayType(ArrayType(BigIntType(true), true), true)',
              'ArrayType(BigIntType(true), true)']),
        ]
        for rows, expected in cases:
            inferred = _infer_schema_from_data(rows)
            self.assertEqual(expected,
                             [repr(f.data_type) for f in inferred.fields])
コード例 #6
0
ファイル: test_types.py プロジェクト: zjureel/flink
    def test_infer_schema(self):
        """One sample of every supported python type maps onto the expected
        Flink data type when inferred as a single row."""
        from decimal import Decimal

        class A(object):
            def __init__(self):
                self.a = 1

        from collections import namedtuple
        Point = namedtuple('Point', 'x y')

        # (python value, repr of the inferred data type), kept in pairs so
        # each sample sits next to its expectation
        cases = [
            (True, 'BooleanType(true)'),
            (1, 'BigIntType(true)'),
            ("a", 'VarCharType(2147483647, true)'),
            (u"a", 'VarCharType(2147483647, true)'),
            (datetime.date(1970, 1, 1), 'DateType(true)'),
            (datetime.time(0, 0, 0), 'TimeType(0, true)'),
            (datetime.datetime(1970, 1, 1, 0, 0),
             'LocalZonedTimestampType(6, true)'),
            (1.0, 'DoubleType(true)'),
            (array.array("d", [1]), "ArrayType(DoubleType(false), true)"),
            ([1], "ArrayType(BigIntType(true), true)"),
            ((1, ), 'RowType(RowField(_1, BigIntType(true), ...))'),
            (Point(1.0, 5.0),
             'RowType(RowField(x, DoubleType(true), ...),'
             'RowField(y, DoubleType(true), ...))'),
            ({"a": 1},
             'MapType(VarCharType(2147483647, false), BigIntType(true), true)'),
            (bytearray(1), 'VarBinaryType(2147483647, true)'),
            (Decimal(1), 'DecimalType(38, 18, true)'),
            (Row(a=1), 'RowType(RowField(a, BigIntType(true), ...))'),
            (Row("a")(1), 'RowType(RowField(a, BigIntType(true), ...))'),
            (A(), 'RowType(RowField(a, BigIntType(true), ...))'),
        ]

        data = [value for value, _ in cases]
        expected = [type_repr for _, type_repr in cases]
        schema = _infer_schema_from_data([data])
        self.assertEqual(expected, [repr(f.data_type) for f in schema.fields])
コード例 #7
0
ファイル: test_types.py プロジェクト: apache/flink
 def test_infer_bigint_type(self):
     """Python ints of any magnitude infer as BIGINT."""
     rows = [Row(f1='a', f2=100000000000000)]
     inferred = _infer_schema_from_data(rows)
     self.assertEqual(DataTypes.BIGINT(), inferred.fields[1].data_type)
     # magnitudes from 1 up to well past 64 bits all stay BIGINT
     for value in (1, 2 ** 10, 2 ** 20, 2 ** 31 - 1,
                   2 ** 31, 2 ** 61, 2 ** 71):
         self.assertEqual(DataTypes.BIGINT(), _infer_type(value))
コード例 #8
0
ファイル: test_types.py プロジェクト: apache/flink
    def test_infer_schema(self):
        """Each supported python value infers the expected Flink type."""
        from decimal import Decimal

        class A(object):
            def __init__(self):
                self.a = 1

        from collections import namedtuple
        Point = namedtuple('Point', 'x y')

        # pairs of (sample value, repr of the expected inferred type)
        samples = [
            (True, 'BooleanType(true)'),
            (1, 'BigIntType(true)'),
            ("a", 'VarCharType(2147483647, true)'),
            (u"a", 'VarCharType(2147483647, true)'),
            (datetime.date(1970, 1, 1), 'DateType(true)'),
            (datetime.time(0, 0, 0), 'TimeType(0, true)'),
            (datetime.datetime(1970, 1, 1, 0, 0), 'TimestampType(0, 6, true)'),
            (1.0, 'DoubleType(true)'),
            (array.array("d", [1]), "ArrayType(DoubleType(false), true)"),
            ([1], "ArrayType(BigIntType(true), true)"),
            ((1,), 'RowType(RowField(_1, BigIntType(true), ...))'),
            (Point(1.0, 5.0),
             'RowType(RowField(x, DoubleType(true), ...),'
             'RowField(y, DoubleType(true), ...))'),
            ({"a": 1},
             'MapType(VarCharType(2147483647, false), BigIntType(true), true)'),
            (bytearray(1), 'VarBinaryType(2147483647, true)'),
            (Decimal(1), 'DecimalType(38, 18, true)'),
            (Row(a=1), 'RowType(RowField(a, BigIntType(true), ...))'),
            (Row("a")(1), 'RowType(RowField(a, BigIntType(true), ...))'),
            (A(), 'RowType(RowField(a, BigIntType(true), ...))'),
        ]

        schema = _infer_schema_from_data([[value for value, _ in samples]])
        self.assertEqual([expected for _, expected in samples],
                         [repr(f.data_type) for f in schema.fields])
コード例 #9
0
ファイル: test_types.py プロジェクト: zjureel/flink
 def test_infer_bigint_type(self):
     """All python int magnitudes map onto BIGINT."""
     schema = _infer_schema_from_data([Row(f1='a', f2=100000000000000)])
     self.assertEqual(DataTypes.BIGINT(), schema.fields[1].data_type)
     # spot-check magnitudes from 1 up to beyond 64 bits
     for value in (1, 2**10, 2**20, 2**31 - 1, 2**31, 2**61, 2**71):
         self.assertEqual(DataTypes.BIGINT(), _infer_type(value))
コード例 #10
0
ファイル: test_types.py プロジェクト: apache/flink
    def test_array_types(self):
        """Verify that every ``array.array`` typecode infers a Flink element
        type wide enough to hold the C type's full value range, and that
        unsupported typecodes raise TypeError."""
        # This test need to make sure that the Scala type selected is at least
        # as large as the python's types. This is necessary because python's
        # array types depend on C implementation on the machine. Therefore there
        # is no machine independent correspondence between python's array types
        # and Scala types.
        # See: https://docs.python.org/2/library/array.html

        # asserts that a one-element array of `typecode` infers an array
        # whose element type's string form equals `element_type`
        def assert_collect_success(typecode, value, element_type):
            self.assertEqual(element_type,
                             str(_infer_type(array.array(typecode, [value])).element_type))

        # supported string types
        #
        # String types in python's array are "u" for Py_UNICODE and "c" for char.
        # "u" will be removed in python 4, and "c" is not supported in python 3.
        supported_string_types = []
        if sys.version_info[0] < 4:
            supported_string_types += ['u']
            # test unicode
            assert_collect_success('u', u'a', 'CHAR')
        if sys.version_info[0] < 3:
            supported_string_types += ['c']
            # test string
            assert_collect_success('c', 'a', 'CHAR')

        # supported float and double
        #
        # Test max, min, and precision for float and double, assuming IEEE 754
        # floating-point format.
        supported_fractional_types = ['f', 'd']
        assert_collect_success('f', ctypes.c_float(1e+38).value, 'FLOAT')
        assert_collect_success('f', ctypes.c_float(1e-38).value, 'FLOAT')
        assert_collect_success('f', ctypes.c_float(1.123456).value, 'FLOAT')
        assert_collect_success('d', sys.float_info.max, 'DOUBLE')
        assert_collect_success('d', sys.float_info.min, 'DOUBLE')
        assert_collect_success('d', sys.float_info.epsilon, 'DOUBLE')

        # smallest Flink integer type that can hold a signed value of
        # `size` bits; returns None (implicitly) above 64 bits
        def get_int_data_type(size):
            if size <= 8:
                return "TINYINT"
            if size <= 16:
                return "SMALLINT"
            if size <= 32:
                return "INT"
            if size <= 64:
                return "BIGINT"

        # supported signed int types
        #
        # The size of C types changes with implementation, we need to make sure
        # that there is no overflow error on the platform running this test.
        supported_signed_int_types = list(
            set(_array_signed_int_typecode_ctype_mappings.keys()).intersection(
                set(_array_type_mappings.keys())))
        for t in supported_signed_int_types:
            ctype = _array_signed_int_typecode_ctype_mappings[t]
            # max_val is 2**(bits-1): one past the largest representable
            # signed value, so test max_val - 1 and -max_val (the extremes)
            max_val = 2 ** (ctypes.sizeof(ctype) * 8 - 1)
            assert_collect_success(t, max_val - 1, get_int_data_type(ctypes.sizeof(ctype) * 8))
            assert_collect_success(t, -max_val, get_int_data_type(ctypes.sizeof(ctype) * 8))

        # supported unsigned int types
        #
        # JVM does not have unsigned types. We need to be very careful to make
        # sure that there is no overflow error.
        supported_unsigned_int_types = list(
            set(_array_unsigned_int_typecode_ctype_mappings.keys()).intersection(
                set(_array_type_mappings.keys())))
        for t in supported_unsigned_int_types:
            ctype = _array_unsigned_int_typecode_ctype_mappings[t]
            # 2**(bits-1) does not fit in a signed type of the same width,
            # hence the expected type is sized for one extra bit
            max_val = 2 ** (ctypes.sizeof(ctype) * 8 - 1)
            assert_collect_success(t, max_val, get_int_data_type(ctypes.sizeof(ctype) * 8 + 1))

        # all supported types
        #
        # Make sure the types tested above:
        # 1. are all supported types
        # 2. cover all supported types
        supported_types = (supported_string_types +
                           supported_fractional_types +
                           supported_signed_int_types +
                           supported_unsigned_int_types)
        self.assertEqual(set(supported_types), set(_array_type_mappings.keys()))

        # all unsupported types
        #
        # Keys in _array_type_mappings is a complete list of all supported types,
        # and types not in _array_type_mappings are considered unsupported.
        # `array.typecodes` are not supported in python 2.
        if sys.version_info[0] < 3:
            all_types = {'c', 'b', 'B', 'u', 'h', 'H', 'i', 'I', 'l', 'L', 'f', 'd'}
        else:
            all_types = set(array.typecodes)
        unsupported_types = all_types - set(supported_types)
        # test unsupported types
        for t in unsupported_types:
            with self.assertRaises(TypeError):
                _infer_schema_from_data([Row(myarray=array.array(t))])
コード例 #11
0
ファイル: test_types.py プロジェクト: apache/flink
 def test_infer_schema_fails(self):
     """Conflicting column types across rows raise TypeError."""
     conflicting_rows = [[1, 1], ["x", 1]]
     with self.assertRaises(TypeError):
         _infer_schema_from_data(conflicting_rows, names=["a", "b"])
コード例 #12
0
ファイル: test_types.py プロジェクト: apache/flink
 def test_infer_schema_not_enough_names(self):
     """Missing field names are filled with positional defaults (_2)."""
     schema = _infer_schema_from_data([["a", "b"]], ["col1"])
     # the original used assertTrue(x, msg), which treats the expected
     # list as the failure message and only checks that schema.names is
     # truthy; assertEqual performs the intended comparison
     self.assertEqual(schema.names, ['col1', '_2'])
コード例 #13
0
ファイル: test_types.py プロジェクト: zjureel/flink
    def test_array_types(self):
        """Verify that every ``array.array`` typecode infers a Flink element
        type wide enough for the C type's full value range, and that
        unsupported typecodes raise TypeError."""
        # This test need to make sure that the Scala type selected is at least
        # as large as the python's types. This is necessary because python's
        # array types depend on C implementation on the machine. Therefore there
        # is no machine independent correspondence between python's array types
        # and Scala types.
        # See: https://docs.python.org/2/library/array.html

        # asserts that a one-element array of `typecode` infers an array
        # whose element type's string form equals `element_type`
        def assert_collect_success(typecode, value, element_type):
            self.assertEqual(
                element_type,
                str(_infer_type(array.array(typecode, [value])).element_type))

        # supported string types
        #
        # String types in python's array are "u" for Py_UNICODE and "c" for char.
        # "u" will be removed in python 4, and "c" is not supported in python 3.
        supported_string_types = []
        if sys.version_info[0] < 4:
            supported_string_types += ['u']
            # test unicode
            assert_collect_success('u', u'a', 'CHAR')

        # supported float and double
        #
        # Test max, min, and precision for float and double, assuming IEEE 754
        # floating-point format.
        supported_fractional_types = ['f', 'd']
        assert_collect_success('f', ctypes.c_float(1e+38).value, 'FLOAT')
        assert_collect_success('f', ctypes.c_float(1e-38).value, 'FLOAT')
        assert_collect_success('f', ctypes.c_float(1.123456).value, 'FLOAT')
        assert_collect_success('d', sys.float_info.max, 'DOUBLE')
        assert_collect_success('d', sys.float_info.min, 'DOUBLE')
        assert_collect_success('d', sys.float_info.epsilon, 'DOUBLE')

        # smallest Flink integer type that can hold a signed value of
        # `size` bits; returns None (implicitly) above 64 bits
        def get_int_data_type(size):
            if size <= 8:
                return "TINYINT"
            if size <= 16:
                return "SMALLINT"
            if size <= 32:
                return "INT"
            if size <= 64:
                return "BIGINT"

        # supported signed int types
        #
        # The size of C types changes with implementation, we need to make sure
        # that there is no overflow error on the platform running this test.
        supported_signed_int_types = list(
            set(_array_signed_int_typecode_ctype_mappings.keys()).intersection(
                set(_array_type_mappings.keys())))
        for t in supported_signed_int_types:
            ctype = _array_signed_int_typecode_ctype_mappings[t]
            # max_val is 2**(bits-1): one past the largest representable
            # signed value, so test max_val - 1 and -max_val (the extremes)
            max_val = 2**(ctypes.sizeof(ctype) * 8 - 1)
            assert_collect_success(t, max_val - 1,
                                   get_int_data_type(ctypes.sizeof(ctype) * 8))
            assert_collect_success(t, -max_val,
                                   get_int_data_type(ctypes.sizeof(ctype) * 8))

        # supported unsigned int types
        #
        # JVM does not have unsigned types. We need to be very careful to make
        # sure that there is no overflow error.
        supported_unsigned_int_types = list(
            set(_array_unsigned_int_typecode_ctype_mappings.keys()).
            intersection(set(_array_type_mappings.keys())))
        for t in supported_unsigned_int_types:
            ctype = _array_unsigned_int_typecode_ctype_mappings[t]
            # 2**(bits-1) does not fit in a signed type of the same width,
            # hence the expected type is sized for one extra bit
            max_val = 2**(ctypes.sizeof(ctype) * 8 - 1)
            assert_collect_success(
                t, max_val, get_int_data_type(ctypes.sizeof(ctype) * 8 + 1))

        # all supported types
        #
        # Make sure the types tested above:
        # 1. are all supported types
        # 2. cover all supported types
        supported_types = (supported_string_types +
                           supported_fractional_types +
                           supported_signed_int_types +
                           supported_unsigned_int_types)
        self.assertEqual(set(supported_types),
                         set(_array_type_mappings.keys()))

        # all unsupported types
        #
        # Keys in _array_type_mappings is a complete list of all supported types,
        # and types not in _array_type_mappings are considered unsupported.
        all_types = set(array.typecodes)
        unsupported_types = all_types - set(supported_types)
        # test unsupported types
        for t in unsupported_types:
            with self.assertRaises(TypeError):
                _infer_schema_from_data([Row(myarray=array.array(t))])
コード例 #14
0
ファイル: test_types.py プロジェクト: zjureel/flink
 def test_infer_schema_fails(self):
     """Inference raises TypeError when a column's type is inconsistent."""
     with self.assertRaises(TypeError):
         # column "a" is an int in the first row but a str in the second
         _infer_schema_from_data([[1, 1], ["x", 1]], names=["a", "b"])
コード例 #15
0
ファイル: test_types.py プロジェクト: zjureel/flink
 def test_infer_schema_not_enough_names(self):
     """Missing field names are filled with positional defaults (_2)."""
     schema = _infer_schema_from_data([["a", "b"]], ["col1"])
     # the original used assertTrue(x, msg), which treats the expected
     # list as the failure message and only checks that schema.names is
     # truthy; assertEqual performs the intended comparison
     self.assertEqual(schema.names, ['col1', '_2'])
コード例 #16
0
    def from_elements(self, elements, schema=None, verify_schema=True):
        """
        Creates a table from a collection of elements.
        The elements types must be acceptable atomic types or acceptable composite types.
        All elements must be of the same type.
        If the elements types are composite types, the composite types must be strictly equal,
        and its subtypes must also be acceptable types.
        e.g. if the elements are tuples, the length of the tuples must be equal, the element types
        of the tuples must be equal in order.

        The built-in acceptable atomic element types contains:

        **int**, **long**, **str**, **unicode**, **bool**,
        **float**, **bytearray**, **datetime.date**, **datetime.time**, **datetime.datetime**,
        **datetime.timedelta**, **decimal.Decimal**

        The built-in acceptable composite element types contains:

        **list**, **tuple**, **dict**, **array**, :class:`pyflink.table.Row`

        If the element type is a composite type, it will be unboxed.
        e.g. table_env.from_elements([(1, 'Hi'), (2, 'Hello')]) will return a table like:

        +----+-------+
        | _1 |  _2   |
        +====+=======+
        | 1  |  Hi   |
        +----+-------+
        | 2  | Hello |
        +----+-------+

        "_1" and "_2" are generated field names.

        Example:
        ::

            # use the second parameter to specify custom field names
            >>> table_env.from_elements([(1, 'Hi'), (2, 'Hello')], ['a', 'b'])
            # use the second parameter to specify a custom table schema
            >>> table_env.from_elements([(1, 'Hi'), (2, 'Hello')],
            ...                         DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT()),
            ...                                        DataTypes.FIELD("b", DataTypes.STRING())]))
            # use the third parameter to switch whether to verify the elements against the schema
            >>> table_env.from_elements([(1, 'Hi'), (2, 'Hello')],
            ...                         DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT()),
            ...                                        DataTypes.FIELD("b", DataTypes.STRING())]),
            ...                         False)

        :param elements: The elements to create a table from.
        :param schema: The schema of the table.
        :param verify_schema: Whether to verify the elements against the schema.
        :return: The result :class:`Table`.
        """

        # builds a verifier for the declared schema (a no-op when
        # verify_schema is False or when the schema is inferred below)
        if isinstance(schema, RowType):
            verify_func = _create_type_verifier(
                schema) if verify_schema else lambda _: True

            def verify_obj(obj):
                verify_func(obj)
                return obj
        elif isinstance(schema, DataType):
            # a single atomic type is wrapped into a one-field row schema
            data_type = schema
            schema = RowType().add("value", schema)

            verify_func = _create_type_verifier(
                data_type,
                name="field value") if verify_schema else lambda _: True

            def verify_obj(obj):
                verify_func(obj)
                return obj
        else:

            def verify_obj(obj):
                return obj

        # materializes one-shot iterables so they can be traversed twice
        if "__len__" not in dir(elements):
            elements = list(elements)

        # infers the schema if not specified
        if schema is None or isinstance(schema, (list, tuple)):
            names = schema  # user-supplied field names, if any
            schema = _infer_schema_from_data(elements, names=names)
            converter = _create_converter(schema)
            elements = map(converter, elements)
            # renames the inferred fields with the user-supplied names;
            # the original tested the already-reassigned `schema` here,
            # which made this branch unreachable
            if isinstance(names, (list, tuple)):
                for i, name in enumerate(names):
                    schema.fields[i].name = name
                    schema.names[i] = name

        elif not isinstance(schema, RowType):
            raise TypeError(
                "schema should be RowType, list, tuple or None, but got: %s" %
                schema)

        # verifies the elements against the specified schema
        elements = map(verify_obj, elements)
        # converts python data to sql data
        elements = [schema.to_sql_type(element) for element in elements]
        return self._from_elements(elements, schema)