def test_table_function(self):
    """Check chained lateral joins over registered Python functions.

    Registers an append sink named "Results" together with the
    ``multi_emit``, ``condition_multi_emit`` and ``multi_num`` functions,
    pipes a three-row table through ``join_lateral`` followed by
    ``left_outer_join_lateral``, inserts the selected columns into the sink,
    executes the job and compares the collected rows with the expected
    strings.
    """
    sink_columns = ['a', 'b', 'c']
    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT()]
    results_sink = source_sink_utils.TestAppendSink(sink_columns, sink_types)
    self.t_env.register_table_sink("Results", results_sink)

    # Each function is registered under the name used by the expression
    # strings further down.
    multi_emit_udtf = udtf(
        MultiEmit(),
        [DataTypes.BIGINT(), DataTypes.BIGINT()],
        [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_function("multi_emit", multi_emit_udtf)
    self.t_env.register_function("condition_multi_emit", condition_multi_emit)
    multi_num_udf = udf(MultiNum(), [DataTypes.BIGINT()], DataTypes.BIGINT())
    self.t_env.register_function("multi_num", multi_num_udf)

    rows = [(1, 1, 3), (2, 1, 6), (3, 2, 9)]
    source = self.t_env.from_elements(rows, ['a', 'b', 'c'])
    inner_joined = source.join_lateral("multi_emit(a, multi_num(b)) as (x, y)")
    outer_joined = inner_joined.left_outer_join_lateral(
        "condition_multi_emit(x, y) as m")
    outer_joined.select("x, y, m").insert_into("Results")
    self.t_env.execute("test")

    actual = source_sink_utils.results()
    expected = [
        "1,0,null", "1,1,null", "2,0,null", "2,1,null",
        "3,0,0", "3,0,1", "3,0,2", "3,1,1", "3,1,2", "3,2,2", "3,3,null",
    ]
    self.assert_equals(actual, expected)
def test_table_function(self):
    """Check chained lateral joins built with the expression-style API.

    Wraps ``MultiEmit`` and ``MultiNum`` as a table function and a scalar
    function, applies them to a three-row table through one ``join_lateral``
    and two successive ``left_outer_join_lateral`` calls (the second one
    through ``identity``), then compares the output rows with the expected
    ``+I[...]`` strings.
    """
    self._register_table_sink(
        ['a', 'b', 'c'],
        [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT()])

    emit_fn = udtf(
        MultiEmit(), result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
    num_fn = udf(MultiNum(), result_type=DataTypes.BIGINT())

    source = self.t_env.from_elements(
        [(1, 1, 3), (2, 1, 6), (3, 2, 9)], ['a', 'b', 'c'])

    # Step 1: inner lateral join producing columns x and y.
    emit_call = emit_fn((source.a + source.a) / 2, num_fn(source.b)).alias('x', 'y')
    with_xy = source.join_lateral(emit_call)

    # Step 2: left outer lateral join producing column m.
    m_call = condition_multi_emit(with_xy.x, with_xy.y).alias('m')
    with_m = with_xy.left_outer_join_lateral(m_call).select("x, y, m")

    # Step 3: left outer lateral join that passes m through `identity` as n.
    n_call = identity(with_m.m).alias('n')
    with_n = with_m.left_outer_join_lateral(n_call).select("x, y, n")

    actual = self._get_output(with_n)
    expected = [
        "+I[1, 0, null]", "+I[1, 1, null]", "+I[2, 0, null]", "+I[2, 1, null]",
        "+I[3, 0, 0]", "+I[3, 0, 1]", "+I[3, 0, 2]", "+I[3, 1, 1]",
        "+I[3, 1, 2]", "+I[3, 2, 2]", "+I[3, 3, null]",
    ]
    self.assert_equals(actual, expected)
def test_table_function(self):
    """Check chained lateral joins using functions registered by name.

    Registers ``multi_emit``, ``condition_multi_emit`` and ``multi_num`` on
    the table environment, runs a three-row table through ``join_lateral``
    and ``left_outer_join_lateral`` using string expressions, and compares
    the output of ``_get_output`` with the expected comma-separated rows.
    """
    self._register_table_sink(
        ['a', 'b', 'c'],
        [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT()])

    # Each function is registered under the name used by the expression
    # strings further down.
    multi_emit_udtf = udtf(
        MultiEmit(), result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_function("multi_emit", multi_emit_udtf)
    self.t_env.register_function("condition_multi_emit", condition_multi_emit)
    multi_num_udf = udf(MultiNum(), result_type=DataTypes.BIGINT())
    self.t_env.register_function("multi_num", multi_num_udf)

    rows = [(1, 1, 3), (2, 1, 6), (3, 2, 9)]
    source = self.t_env.from_elements(rows, ['a', 'b', 'c'])
    inner_joined = source.join_lateral("multi_emit(a, multi_num(b)) as (x, y)")
    outer_joined = inner_joined.left_outer_join_lateral(
        "condition_multi_emit(x, y) as m")
    projected = outer_joined.select("x, y, m")

    actual = self._get_output(projected)
    expected = [
        "1,0,null", "1,1,null", "2,0,null", "2,1,null",
        "3,0,0", "3,0,1", "3,0,2", "3,1,1", "3,1,2", "3,2,2", "3,3,null",
    ]
    self.assert_equals(actual, expected)
def test_execute_from_json_plan(self):
    """Compile an INSERT statement to a plan, execute it and check the output.

    Writes a small CSV input file under ``self.tempdir``, declares
    ``source_table`` and ``sink_table`` with the ``filesystem`` connector,
    registers ``multi_emit`` as a temporary system function, compiles the
    INSERT statement with ``compilePlanSql``, runs the plan with
    ``executePlan`` and waits for it, then asserts on the sorted, stripped
    lines of every file found under the sink path.
    """
    # create source file path
    tmp_dir = self.tempdir
    data = ['1,1', '3,2', '2,1']
    source_path = tmp_dir + '/test_execute_from_json_plan_input.csv'
    sink_path = tmp_dir + '/test_execute_from_json_plan_out'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    source_table = (
        " CREATE TABLE source_table ( a BIGINT, b BIGINT )"
        " WITH ( 'connector' = 'filesystem', 'path' = '%s', 'format' = 'csv' ) "
    ) % source_path
    self.t_env.execute_sql(source_table)

    sink_table = (
        " CREATE TABLE sink_table ( a BIGINT, b BIGINT, c BIGINT )"
        " WITH ( 'connector' = 'filesystem', 'path' = '%s', 'format' = 'csv' ) "
    ) % sink_path
    self.t_env.execute_sql(sink_table)

    self.t_env.create_temporary_system_function(
        "multi_emit",
        udtf(MultiEmit(), result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()]))

    json_plan = self.t_env._j_tenv.compilePlanSql(
        "INSERT INTO sink_table "
        "SELECT a, x, y FROM source_table "
        "LEFT JOIN LATERAL TABLE(multi_emit(a, b))"
        " as T(x, y)"
        " ON TRUE")

    from py4j.java_gateway import get_method
    # `await` is a Python keyword, so the Java method cannot be reached with
    # plain attribute syntax; look it up by name instead.
    get_method(self.t_env._j_tenv.executePlan(json_plan), "await")()

    import glob
    lines = []
    for out_path in glob.glob(sink_path + '/*'):
        # Use a context manager so every sink file handle is closed promptly
        # instead of being left for the garbage collector.
        with open(out_path, 'r') as sink_file:
            lines.extend(line.strip() for line in sink_file)
    lines.sort()
    self.assertEqual(lines, ['1,1,0', '2,2,0', '3,3,0', '3,3,1'])
def test_row_type_as_input_types_and_result_types(self):
    """Check that a bare ROW type given to ``udtf`` is stored as a one-item list.

    Passes a single ``DataTypes.ROW`` (not a list) as both ``input_types``
    and ``result_types`` and asserts that ``_input_types`` and
    ``_result_types`` each equal a list holding one equivalent ROW type.
    """
    def single_bigint_row():
        # Build a fresh ROW type on every call so the assertions compare
        # independently constructed objects rather than the same instance.
        return DataTypes.ROW([DataTypes.FIELD("a", DataTypes.BIGINT())])

    # test input_types and result_types are DataTypes.ROW
    table_func = udtf(
        lambda i: i,
        input_types=single_bigint_row(),
        result_types=single_bigint_row())

    self.assertEqual(table_func._input_types, [single_bigint_row()])
    self.assertEqual(table_func._result_types, [single_bigint_row()])
def execute(self, function_context: FlinkFunctionContext, input_list: List[Table]) -> List[Table]:
    """Lateral-join the first input table against a ``search`` table function.

    :param function_context: context object whose ``get_table_env()`` supplies
        the table environment the function is registered on.
    :param input_list: input tables; only the first one is used.
    :return: a single-element list holding the joined table projected to
        ``face_id`` and ``near_id``.
    """
    t_env = function_context.get_table_env()
    source_table = input_list[0]

    # Register the UDTF under the name referenced by the join expression.
    search_udtf = udtf(
        SearchUDTF1(self.path, self.element_type),
        DataTypes.STRING(),
        DataTypes.STRING())
    t_env.register_function("search", search_udtf)

    joined = source_table.join_lateral("search(feature_data) as near_id")
    projected = joined.select("face_id, near_id")
    return [projected]
def transform(self, *inputs: Table) -> List[Table]:
    """Flat-map the first input table through a ``Predict`` table function.

    The UDTF's result types are the input table's field types followed by
    ``self.predict_data_types``; the resulting table is aliased to the input
    field names followed by ``self.predict_col_names``.

    :param inputs: input tables; only the first one is used.
    :return: a single-element list holding the transformed table.
    """
    source = inputs[0]

    # Capture the input schema so its columns can be carried through.
    row_type = source.get_schema().to_row_data_type()
    input_types = row_type.field_types()
    input_names = row_type.field_names()

    model_dir = os.path.join(self.path, "model_data")
    predictor = Predict(model_dir, self.predict_col_names)
    output_types = input_types + self.predict_data_types
    predict_udtf = udtf(f=predictor, result_types=output_types)

    predicted = source.flat_map(predict_udtf)
    renamed = predicted.alias(*input_names, *self.predict_col_names)
    return [renamed]
def test_table_function_with_sql_query(self):
    """Check a Python table function used from a SQL ``LATERAL TABLE`` join.

    Registers ``multi_emit`` as a temporary system function, exposes a
    three-row table as ``MyTable``, runs a ``LEFT JOIN LATERAL TABLE`` query
    against it and compares the output of ``_get_output`` with the expected
    rows.
    """
    self._register_table_sink(
        ['a', 'b', 'c'],
        [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT()])

    multi_emit_udtf = udtf(
        MultiEmit(), result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.create_temporary_system_function("multi_emit", multi_emit_udtf)

    rows = [(1, 1, 3), (2, 1, 6), (3, 2, 9)]
    source = self.t_env.from_elements(rows, ['a', 'b', 'c'])
    # The query below refers to the table by this registered name.
    self.t_env.register_table("MyTable", source)

    query_result = self.t_env.sql_query(
        "SELECT a, x, y FROM MyTable LEFT JOIN LATERAL TABLE(multi_emit(a, b)) as T(x, y)"
        " ON TRUE")

    actual = self._get_output(query_result)
    expected = ["1,1,0", "2,2,0", "3,3,0", "3,3,1"]
    self.assert_equals(actual, expected)
def test_table_function_with_sql_query(self):
    """Check a Python table function used from a SQL ``LATERAL TABLE`` join.

    Registers an append sink named "Results" and the ``multi_emit`` function,
    exposes a three-row table as ``MyTable``, inserts the result of a
    ``LEFT JOIN LATERAL TABLE`` query into the sink, executes the job and
    compares the collected rows with the expected strings.
    """
    sink_columns = ['a', 'b', 'c']
    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT()]
    results_sink = source_sink_utils.TestAppendSink(sink_columns, sink_types)
    self.t_env.register_table_sink("Results", results_sink)

    multi_emit_udtf = udtf(
        MultiEmit(),
        [DataTypes.BIGINT(), DataTypes.BIGINT()],
        [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_function("multi_emit", multi_emit_udtf)

    rows = [(1, 1, 3), (2, 1, 6), (3, 2, 9)]
    source = self.t_env.from_elements(rows, ['a', 'b', 'c'])
    # The query below refers to the table by this registered name.
    self.t_env.register_table("MyTable", source)

    query_result = self.t_env.sql_query(
        "SELECT a, x, y FROM MyTable LEFT JOIN LATERAL TABLE(multi_emit(a, b)) as T(x, y)"
        " ON TRUE")
    query_result.insert_into("Results")
    self.t_env.execute("test")

    actual = source_sink_utils.results()
    expected = ["1,1,0", "2,2,0", "3,3,0", "3,3,1"]
    self.assert_equals(actual, expected)
word STRING, length INT ) WITH ( 'connector' = 'filesystem', 'format' = 'csv', 'path' = './output/udtf.output2' ) """) my_table = table_env.from_path("mySource") # configure the off-heap memory of current taskmanager to enable the python worker uses off-heap memory. table_env.get_config().get_configuration().set_string( "taskmanager.memory.task.off-heap.size", '80m') # register the Python Table Function table_env.register_function( "split", udtf(Split(), DataTypes.STRING(), [DataTypes.STRING(), DataTypes.INT()])) # use the Python Table Function in Python Table API # my_table.join_lateral("split(a) as (word, length)").insert_into("mySink1") my_table.join_lateral("split(a) as (word, length)").execute_insert( "mySink1").get_job_client().get_job_execution_result().result() # my_table.left_outer_join_lateral("split(a) as (word, length)").execute_insert( # "mySink2").get_job_client().get_job_execution_result().result() # my_table.left_outer_join_lateral("split(a) as (word, length)") # table_env.execute("udtf job")