@classmethod
def setUpClass(cls):
    super(PandasConversionTestBase, cls).setUpClass()
    cls.data = [(1, 1, 1, 1, True, 1.1, 1.2, 'hello', bytearray(b"aaa"),
                 decimal.Decimal('1000000000000000000.01'),
                 datetime.date(2014, 9, 13),
                 datetime.time(hour=1, minute=0, second=1),
                 datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                 ['hello', '中文'],
                 Row(a=1, b='hello',
                     c=datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                     d=[1, 2])),
                (1, 2, 2, 2, False, 2.1, 2.2, 'world', bytearray(b"bbb"),
                 decimal.Decimal('1000000000000000000.02'),
                 datetime.date(2014, 9, 13),
                 datetime.time(hour=1, minute=0, second=1),
                 datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                 ['hello', '中文'],
                 Row(a=1, b='hello',
                     c=datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                     d=[1, 2]))]
    cls.data_type = DataTypes.ROW(
        [DataTypes.FIELD("f1", DataTypes.TINYINT()),
         DataTypes.FIELD("f2", DataTypes.SMALLINT()),
         DataTypes.FIELD("f3", DataTypes.INT()),
         DataTypes.FIELD("f4", DataTypes.BIGINT()),
         DataTypes.FIELD("f5", DataTypes.BOOLEAN()),
         DataTypes.FIELD("f6", DataTypes.FLOAT()),
         DataTypes.FIELD("f7", DataTypes.DOUBLE()),
         DataTypes.FIELD("f8", DataTypes.STRING()),
         DataTypes.FIELD("f9", DataTypes.BYTES()),
         DataTypes.FIELD("f10", DataTypes.DECIMAL(38, 18)),
         DataTypes.FIELD("f11", DataTypes.DATE()),
         DataTypes.FIELD("f12", DataTypes.TIME()),
         DataTypes.FIELD("f13", DataTypes.TIMESTAMP(3)),
         DataTypes.FIELD("f14", DataTypes.ARRAY(DataTypes.STRING())),
         DataTypes.FIELD("f15", DataTypes.ROW(
             [DataTypes.FIELD("a", DataTypes.INT()),
              DataTypes.FIELD("b", DataTypes.STRING()),
              DataTypes.FIELD("c", DataTypes.TIMESTAMP(3)),
              DataTypes.FIELD("d", DataTypes.ARRAY(DataTypes.INT()))]))],
        False)
    cls.pdf = cls.create_pandas_data_frame()
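# Hedged context (not part of the original fixture): the schema and pdf built
# above are exercised through PyFlink's pandas conversion entry points, e.g.
#
#     table = t_env.from_pandas(cls.pdf, cls.data_type)
#     result_pdf = table.to_pandas()
#
# `create_pandas_data_frame` is defined elsewhere in the test base and is not
# reproduced here.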
def _create_judf(self, serialized_func, j_input_types, j_function_kind):
    if self._func_type == "pandas":
        from pyflink.table.types import DataTypes
        self._accumulator_type = DataTypes.ARRAY(self._result_type)

    if j_input_types is not None:
        gateway = get_gateway()
        j_input_types = java_utils.to_jarray(
            gateway.jvm.DataType,
            [_to_java_data_type(i) for i in self._input_types])
    j_result_type = _to_java_data_type(self._result_type)
    j_accumulator_type = _to_java_data_type(self._accumulator_type)

    gateway = get_gateway()
    if self._is_table_aggregate:
        PythonAggregateFunction = gateway.jvm \
            .org.apache.flink.table.functions.python.PythonTableAggregateFunction
    else:
        PythonAggregateFunction = gateway.jvm \
            .org.apache.flink.table.functions.python.PythonAggregateFunction
    j_aggregate_function = PythonAggregateFunction(
        self._name,
        bytearray(serialized_func),
        j_input_types,
        j_result_type,
        j_accumulator_type,
        j_function_kind,
        self._deterministic,
        self._takes_row_as_input,
        _get_python_env())
    return j_aggregate_function
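# For context, a hedged sketch of the user-facing API that eventually reaches
# _create_judf above: a pandas UDAF declared via udaf(..., func_type="pandas"),
# which is why the accumulator type is derived as ARRAY(result_type). The
# function body here is illustrative only.
from pyflink.table import DataTypes
from pyflink.table.udf import udaf

# the input is a pandas.Series when func_type="pandas"
mean_udaf = udaf(lambda v: v.mean(),
                 result_type=DataTypes.FLOAT(),
                 func_type="pandas")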
def test_nested_udt_in_df(self):
    expected_schema = DataTypes.ROW() \
        .add("_1", DataTypes.BIGINT()) \
        .add("_2", DataTypes.ARRAY(PythonOnlyUDT()))
    data = (1, [PythonOnlyPoint(float(1), float(2))])
    self.assertEqual(expected_schema, _infer_type(data))

    expected_schema = DataTypes.ROW().add("_1", DataTypes.BIGINT()).add(
        "_2", DataTypes.MAP(DataTypes.BIGINT(False), PythonOnlyUDT()))
    p = (1, {1: PythonOnlyPoint(1, float(2))})
    self.assertEqual(expected_schema, _infer_type(p))
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD(
            'string_array',
            DataTypes.ARRAY(DataTypes.STRING()).bridged_to('java.util.ArrayList')
        ),
        DataTypes.FIELD(
            'int_array',
            DataTypes.ARRAY(DataTypes.INT()).bridged_to('java.util.ArrayList')
        ),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, data
def test_array_type(self):
    # nullable/not_null flag will be lost during the conversion.
    test_types = [DataTypes.ARRAY(DataTypes.BIGINT()),
                  DataTypes.ARRAY(DataTypes.BIGINT()),
                  DataTypes.ARRAY(DataTypes.STRING()),
                  DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
                  DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))]
    java_types = [_to_java_type(item) for item in test_types]
    converted_python_types = [_from_java_type(item) for item in java_types]
    self.assertEqual(test_types, converted_python_types)
def test_array_type(self):
    test_types = [DataTypes.ARRAY(DataTypes.BIGINT()),
                  # an ARRAY of a not-null basic type maps to a primitive array
                  DataTypes.ARRAY(DataTypes.BIGINT().not_null()),
                  DataTypes.ARRAY(DataTypes.STRING()),
                  DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT())),
                  DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))]
    java_types = [_to_java_type(item) for item in test_types]
    converted_python_types = [_from_java_type(item) for item in java_types]
    self.assertEqual(test_types, converted_python_types)
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('string_array', DataTypes.ARRAY(DataTypes.STRING())),
        DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT())),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    conversion_row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.OBJECT_ARRAY(Types.STRING()),
        Types.OBJECT_ARRAY(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, conversion_row_type_info, data
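# Hedged usage sketch for the helper above, assuming PyFlink's FileSink and
# ParquetBulkWriters APIs; the output path is hypothetical. The extra
# conversion_row_type_info maps the LIST fields to object arrays before writing.
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.file_system import FileSink
from pyflink.datastream.formats.parquet import ParquetBulkWriters

env = StreamExecutionEnvironment.get_execution_environment()
row_type, row_type_info, conversion_type_info, data = \
    _create_parquet_array_row_and_data()
sink = FileSink.for_bulk_format(
    '/tmp/parquet-array-out',  # hypothetical output path
    ParquetBulkWriters.for_row_type(row_type, utc_timestamp=True)).build()
env.from_collection(data, type_info=conversion_type_info).sink_to(sink)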
def _create_judf(self, serialized_func, j_input_types, j_function_kind):
    if self._func_type == "pandas":
        from pyflink.table.types import DataTypes
        self._accumulator_type = DataTypes.ARRAY(self._result_type)

    j_result_type = _to_java_type(self._result_type)
    j_accumulator_type = _to_java_type(self._accumulator_type)
    gateway = get_gateway()
    PythonAggregateFunction = gateway.jvm \
        .org.apache.flink.table.functions.python.PythonAggregateFunction
    j_aggregate_function = PythonAggregateFunction(
        self._name,
        bytearray(serialized_func),
        j_input_types,
        j_result_type,
        j_accumulator_type,
        j_function_kind,
        self._deterministic,
        _get_python_env())
    return j_aggregate_function
def add_array_column(self,
                     name: str,
                     separator: str = ';',
                     element_type: Optional[DataType] = DataTypes.STRING()) \
        -> 'CsvSchemaBuilder':
    """
    Add an array column to the schema. The element type can be specified via
    ``element_type`` and should be a primitive type.

    :param name: Name of the column.
    :param separator: Text separator of array elements, defaults to ``;``.
    :param element_type: DataType of array elements, defaults to
        ``DataTypes.STRING()``.
    """
    self._j_schema_builder.addArrayColumn(name, separator)
    self._fields.append(DataTypes.FIELD(name, DataTypes.ARRAY(element_type)))
    return self
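# A minimal usage sketch for the builder method above (assuming the
# surrounding CsvSchema.builder() API): a cell such as "1;2;3" in the
# 'scores' column is parsed into an ARRAY<INT> field.
from pyflink.datastream.formats.csv import CsvSchema
from pyflink.table.types import DataTypes

schema = CsvSchema.builder() \
    .add_string_column('name') \
    .add_array_column('scores', separator=';', element_type=DataTypes.INT()) \
    .build()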
@classmethod
def sql_type(cls):
    return DataTypes.ARRAY(DataTypes.DOUBLE(False))
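# Hedged sketch of the kind of class this method belongs to, mirroring how the
# PythonOnlyUDT test type pairs sql_type() with serialize()/deserialize().
# The point class and module name are illustrative.
from pyflink.table.types import DataTypes, UserDefinedType

class PythonOnlyUDT(UserDefinedType):

    @classmethod
    def sql_type(cls):
        # the SQL-visible storage type: a fixed pair of non-null doubles
        return DataTypes.ARRAY(DataTypes.DOUBLE(False))

    @classmethod
    def module(cls):
        return '__main__'

    def serialize(self, obj):
        # store a point as the double array [x, y]
        return [obj.x, obj.y]

    def deserialize(self, datum):
        return PythonOnlyPoint(datum[0], datum[1])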
def test_verify_type_not_nullable(self):
    import array
    import datetime
    import decimal

    schema = DataTypes.ROW([
        DataTypes.FIELD('s', DataTypes.STRING(nullable=False)),
        DataTypes.FIELD('i', DataTypes.INT(True))])

    class MyObj:
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    # obj, data_type
    success_spec = [
        # String
        ("", DataTypes.STRING()),
        (u"", DataTypes.STRING()),
        # UDT
        (ExamplePoint(1.0, 2.0), ExamplePointUDT()),
        # Boolean
        (True, DataTypes.BOOLEAN()),
        # TinyInt
        (-(2**7), DataTypes.TINYINT()),
        (2**7 - 1, DataTypes.TINYINT()),
        # SmallInt
        (-(2**15), DataTypes.SMALLINT()),
        (2**15 - 1, DataTypes.SMALLINT()),
        # Int
        (-(2**31), DataTypes.INT()),
        (2**31 - 1, DataTypes.INT()),
        # BigInt
        (2**64, DataTypes.BIGINT()),
        # Float & Double
        (1.0, DataTypes.FLOAT()),
        (1.0, DataTypes.DOUBLE()),
        # Decimal
        (decimal.Decimal("1.0"), DataTypes.DECIMAL(10, 0)),
        # Binary
        (bytearray([1]), DataTypes.BINARY(1)),
        # Date/Time/Timestamp
        (datetime.date(2000, 1, 2), DataTypes.DATE()),
        (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.DATE()),
        (datetime.time(1, 1, 2), DataTypes.TIME()),
        (datetime.datetime(2000, 1, 2, 3, 4), DataTypes.TIMESTAMP()),
        # Array
        ([], DataTypes.ARRAY(DataTypes.INT())),
        (["1", None], DataTypes.ARRAY(DataTypes.STRING(nullable=True))),
        ([1, 2], DataTypes.ARRAY(DataTypes.INT())),
        ((1, 2), DataTypes.ARRAY(DataTypes.INT())),
        (array.array('h', [1, 2]), DataTypes.ARRAY(DataTypes.INT())),
        # Map
        ({}, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
        ({"a": 1}, DataTypes.MAP(DataTypes.STRING(), DataTypes.INT())),
        ({"a": None},
         DataTypes.MAP(DataTypes.STRING(nullable=False), DataTypes.INT(True))),
        # Struct
        ({"s": "a", "i": 1}, schema),
        ({"s": "a", "i": None}, schema),
        ({"s": "a"}, schema),
        ({"s": "a", "f": 1.0}, schema),
        (Row(s="a", i=1), schema),
        (Row(s="a", i=None), schema),
        (Row(s="a", i=1, f=1.0), schema),
        (["a", 1], schema),
        (["a", None], schema),
        (("a", 1), schema),
        (MyObj(s="a", i=1), schema),
        (MyObj(s="a", i=None), schema),
        (MyObj(s="a"), schema),
    ]

    # obj, data_type, exception class
    failure_spec = [
        # Char/VarChar (match anything but None)
        (None, DataTypes.VARCHAR(1), ValueError),
        (None, DataTypes.CHAR(1), ValueError),
        # VarChar (length exceeds maximum length)
        ("abc", DataTypes.VARCHAR(1), ValueError),
        # Char (length exceeds length)
        ("abc", DataTypes.CHAR(1), ValueError),
        # UDT
        (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),
        # Boolean
        (1, DataTypes.BOOLEAN(), TypeError),
        ("True", DataTypes.BOOLEAN(), TypeError),
        ([1], DataTypes.BOOLEAN(), TypeError),
        # TinyInt
        (-(2**7) - 1, DataTypes.TINYINT(), ValueError),
        (2**7, DataTypes.TINYINT(), ValueError),
        ("1", DataTypes.TINYINT(), TypeError),
        (1.0, DataTypes.TINYINT(), TypeError),
        # SmallInt
        (-(2**15) - 1, DataTypes.SMALLINT(), ValueError),
        (2**15, DataTypes.SMALLINT(), ValueError),
        # Int
        (-(2**31) - 1, DataTypes.INT(), ValueError),
        (2**31, DataTypes.INT(), ValueError),
        # Float & Double
        (1, DataTypes.FLOAT(), TypeError),
        (1, DataTypes.DOUBLE(), TypeError),
        # Decimal
        (1.0, DataTypes.DECIMAL(10, 0), TypeError),
        (1, DataTypes.DECIMAL(10, 0), TypeError),
        ("1.0", DataTypes.DECIMAL(10, 0), TypeError),
        # Binary
        (1, DataTypes.BINARY(1), TypeError),
        # VarBinary (length exceeds maximum length)
        (bytearray([1, 2]), DataTypes.VARBINARY(1), ValueError),
        # Binary (length exceeds length)
        (bytearray([1, 2]), DataTypes.BINARY(1), ValueError),
        # Date/Time/Timestamp
        ("2000-01-02", DataTypes.DATE(), TypeError),
        ("10:01:02", DataTypes.TIME(), TypeError),
        (946811040, DataTypes.TIMESTAMP(), TypeError),
        # Array
        (["1", None],
         DataTypes.ARRAY(DataTypes.VARCHAR(1, nullable=False)), ValueError),
        ([1, "2"], DataTypes.ARRAY(DataTypes.INT()), TypeError),
        # Map
        ({"a": 1}, DataTypes.MAP(DataTypes.INT(), DataTypes.INT()), TypeError),
        ({"a": "1"},
         DataTypes.MAP(DataTypes.VARCHAR(1), DataTypes.INT()), TypeError),
        ({"a": None},
         DataTypes.MAP(DataTypes.VARCHAR(1), DataTypes.INT(False)), ValueError),
        # Struct
        ({"s": "a", "i": "1"}, schema, TypeError),
        (Row(s="a"), schema, ValueError),  # Row can't have missing field
        (Row(s="a", i="1"), schema, TypeError),
        (["a"], schema, ValueError),
        (["a", "1"], schema, TypeError),
        (MyObj(s="a", i="1"), schema, TypeError),
        (MyObj(s=None, i="1"), schema, ValueError),
    ]

    # Check success cases
    for obj, data_type in success_spec:
        try:
            _create_type_verifier(data_type.not_null())(obj)
        except (TypeError, ValueError):
            self.fail("verify_type(%s, %s, nullable=False)" % (obj, data_type))

    # Check failure cases
    for obj, data_type, exp in failure_spec:
        msg = "verify_type(%s, %s, nullable=False) == %s" % (obj, data_type, exp)
        with self.assertRaises(exp, msg=msg):
            _create_type_verifier(data_type.not_null())(obj)
def test_merge_type(self):
    self.assertEqual(_merge_type(DataTypes.BIGINT(), DataTypes.NULL()),
                     DataTypes.BIGINT())
    self.assertEqual(_merge_type(DataTypes.NULL(), DataTypes.BIGINT()),
                     DataTypes.BIGINT())
    self.assertEqual(_merge_type(DataTypes.BIGINT(), DataTypes.BIGINT()),
                     DataTypes.BIGINT())

    self.assertEqual(
        _merge_type(DataTypes.ARRAY(DataTypes.BIGINT()),
                    DataTypes.ARRAY(DataTypes.BIGINT())),
        DataTypes.ARRAY(DataTypes.BIGINT()))
    with self.assertRaises(TypeError):
        _merge_type(DataTypes.ARRAY(DataTypes.BIGINT()),
                    DataTypes.ARRAY(DataTypes.DOUBLE()))

    self.assertEqual(
        _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                    DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT())),
        DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()))
    with self.assertRaises(TypeError):
        _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                    DataTypes.MAP(DataTypes.DOUBLE(), DataTypes.BIGINT()))
    with self.assertRaises(TypeError):
        _merge_type(DataTypes.MAP(DataTypes.STRING(), DataTypes.BIGINT()),
                    DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()))

    self.assertEqual(
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                           DataTypes.FIELD('f2', DataTypes.STRING())])),
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                       DataTypes.FIELD('f2', DataTypes.STRING())]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.BIGINT()),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.DOUBLE()),
                           DataTypes.FIELD('f2', DataTypes.STRING())]))

    self.assertEqual(
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))]),
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))])),
        DataTypes.ROW([DataTypes.FIELD(
            'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.BIGINT())]))]),
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ROW([DataTypes.FIELD('f2', DataTypes.STRING())]))]))

    self.assertEqual(
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                           DataTypes.FIELD('f2', DataTypes.STRING())])),
        DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                       DataTypes.FIELD('f2', DataTypes.STRING())]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.BIGINT())),
                           DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([DataTypes.FIELD('f1', DataTypes.ARRAY(DataTypes.DOUBLE())),
                           DataTypes.FIELD('f2', DataTypes.STRING())]))

    self.assertEqual(
        _merge_type(
            DataTypes.ROW([
                DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.BIGINT())),
                DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([
                DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.BIGINT())),
                DataTypes.FIELD('f2', DataTypes.STRING())])),
        DataTypes.ROW([
            DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                DataTypes.BIGINT())),
            DataTypes.FIELD('f2', DataTypes.STRING())]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([
                DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.BIGINT())),
                DataTypes.FIELD('f2', DataTypes.STRING())]),
            DataTypes.ROW([
                DataTypes.FIELD('f1', DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.DOUBLE())),
                DataTypes.FIELD('f2', DataTypes.STRING())]))

    self.assertEqual(
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ARRAY(DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.BIGINT())))]),
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ARRAY(DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.BIGINT())))])),
        DataTypes.ROW([DataTypes.FIELD(
            'f1', DataTypes.ARRAY(DataTypes.MAP(DataTypes.STRING(),
                                                DataTypes.BIGINT())))]))
    with self.assertRaises(TypeError):
        _merge_type(
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ARRAY(DataTypes.MAP(DataTypes.STRING(),
                                                    DataTypes.BIGINT())))]),
            DataTypes.ROW([DataTypes.FIELD(
                'f1', DataTypes.ARRAY(DataTypes.MAP(DataTypes.DOUBLE(),
                                                    DataTypes.BIGINT())))]))
def get_accumulator_type(self):
    return DataTypes.ARRAY(DataTypes.BIGINT())
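# Hedged sketch of an AggregateFunction this method could belong to: a simple
# aggregate whose accumulator is the BIGINT array [running_sum, row_count].
# The class name and aggregation logic are illustrative, not the original's.
from pyflink.table import DataTypes
from pyflink.table.udf import AggregateFunction

class SumAndCount(AggregateFunction):

    def create_accumulator(self):
        return [0, 0]  # [running_sum, row_count]

    def accumulate(self, accumulator, value):
        if value is not None:
            accumulator[0] += value
            accumulator[1] += 1

    def get_value(self, accumulator):
        return accumulator[0]

    def get_result_type(self):
        return DataTypes.BIGINT()

    def get_accumulator_type(self):
        return DataTypes.ARRAY(DataTypes.BIGINT())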
def word_count():
    environment_settings = EnvironmentSettings.new_instance() \
        .in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(environment_settings=environment_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.",
                          e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    # explicitly set the Python executable in case `python` does not point to
    # a Python 3 interpreter
    t_env.get_config().set_python_executable("python3")

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("city", DataTypes.STRING())
                     .field("sales_volume", DataTypes.BIGINT())
                     .field("sales", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()), DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    t_env.register_function("split", split)
    t_env.register_function("get", get)
    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    data = [("iPhone 11,30,5499,Beijing",),
            ("iPhone 11 Pro,20,8699,Guangzhou",),
            ("MacBook Pro,10,9999,Beijing",),
            ("AirPods Pro,50,1999,Beijing",),
            ("MacBook Pro,10,11499,Shanghai",),
            ("iPhone 11,30,5999,Shanghai",),
            ("iPhone 11 Pro,20,9999,Shenzhen",),
            ("MacBook Pro,10,13899,Hangzhou",),
            ("iPhone 11,10,6799,Beijing",),
            ("MacBook Pro,10,18999,Beijing",),
            ("iPhone 11 Pro,10,11799,Shenzhen",),
            ("MacBook Pro,10,22199,Shanghai",),
            ("AirPods Pro,40,1999,Shanghai",)]

    t_env.from_elements(data, ["line"]) \
        .select("split(line) as str_array") \
        .select("get(str_array, 3) as city, "
                "get(str_array, 1).cast(LONG) as count, "
                "get(str_array, 2).cast(LONG) as unit_price") \
        .select("city, count, count * unit_price as total_price") \
        .group_by("city") \
        .select("city, "
                "sum(count) as sales_volume, "
                "sum(total_price) as sales") \
        .insert_into("Results")

    t_env.execute("word_count")
def test():
    # 1. create a TableEnvironment
    # env_settings = EnvironmentSettings.new_instance().in_streaming_mode() \
    #     .use_blink_planner().build()
    # table_env = StreamTableEnvironment.create(environment_settings=env_settings)
    env_settings = EnvironmentSettings.new_instance() \
        .in_batch_mode().use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # 2. create source Table
    table_env.execute_sql("""
        CREATE TABLE source_table (
            Region VARCHAR,
            Country VARCHAR,
            Item_Type VARCHAR,
            Sales_Channel VARCHAR,
            Order_Priority VARCHAR,
            Order_Date VARCHAR,
            Order_ID VARCHAR,
            Ship_Date VARCHAR,
            Units_Sold VARCHAR,
            Unit_Price VARCHAR,
            Unit_Cost VARCHAR,
            Total_Revenue VARCHAR,
            Total_Cost VARCHAR,
            Total_Profit VARCHAR
        ) WITH (
            'connector' = 'filesystem',
            'path' = '/tmp/data/5m_Sales_Records.csv',
            'format' = 'csv'
        )
    """)

    # 3. create sink Table
    table_env.execute_sql("""
        CREATE TABLE sink_table (
            Region VARCHAR,
            Country VARCHAR,
            Item_Type VARCHAR,
            Sales_Channel VARCHAR,
            Order_Priority VARCHAR,
            Order_Date VARCHAR,
            Order_ID VARCHAR,
            Ship_Date VARCHAR,
            Units_Sold VARCHAR,
            Unit_Price VARCHAR,
            Unit_Cost VARCHAR,
            Total_Revenue VARCHAR,
            Total_Cost VARCHAR,
            Total_Profit VARCHAR
        ) WITH (
            'connector' = 'filesystem',
            'path' = '/tmp/data/xxx_Sales_Records.csv',
            'format' = 'csv'
        )
    """)

    # 4. register UDFs (declared here, though the query below does not use them)
    @udf(input_types=DataTypes.STRING(),
         result_type=DataTypes.ARRAY(DataTypes.STRING()))
    def split(input_str: str):
        return input_str.split(",")

    @udf(input_types=[DataTypes.ARRAY(DataTypes.STRING()), DataTypes.INT()],
         result_type=DataTypes.STRING())
    def get(arr, index):
        return arr[index]

    table_env.register_function("split", split)
    table_env.register_function("get", get)

    # 5. run the query and write the result to the sink
    table_env.sql_query("SELECT * FROM source_table order by Region") \
        .execute_insert("sink_table").wait()