def test_udf_without_arguments(self):
    """Zero-argument UDFs produce their constant once for every input row."""
    constant_one = udf(lambda: 1, result_type=DataTypes.BIGINT(), deterministic=True)
    constant_two = udf(lambda: 2, result_type=DataTypes.BIGINT(), deterministic=False)

    self.t_env.execute_sql(
        """ CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink') """)

    rows = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projection = rows.select(constant_one(), constant_two())
    projection.execute_insert("Results").wait()

    # Three input rows, so the same constant pair is expected three times.
    self.assert_equals(source_sink_utils.results(), ["+I[1, 2]"] * 3)
def test_udf_without_arguments(self):
    """Zero-argument UDFs produce their constant once for every input row."""
    constant_one = udf(lambda: 1, result_type=DataTypes.BIGINT(), deterministic=True)
    constant_two = udf(lambda: 2, result_type=DataTypes.BIGINT(), deterministic=False)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    rows = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projection = rows.select(constant_one(), constant_two())
    projection.execute_insert("Results").wait()

    # Three input rows, so the same constant pair is expected three times.
    self.assert_equals(source_sink_utils.results(), ["+I[1, 2]"] * 3)
def test_chaining_scalar_function(self):
    """Nest two scalar UDFs inside ``add`` and check the rows written to the sink."""
    increment = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    decrement = udf(SubtractOne(), result_type=DataTypes.BIGINT())

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()]
    sink = source_sink_utils.TestAppendSink(['a', 'b', 'c'], sink_types)
    self.t_env.register_table_sink("Results", sink)

    rows = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
    # add(add_one(a), subtract_one(b)) followed by column c and a literal 1.
    chained = add(increment(rows.a), decrement(rows.b))
    rows.select(chained, rows.c, expr.lit(1)).execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["3,1,1", "7,2,1", "4,3,1"])
def test_set_environment(self):
    """Point the Python executable at a symlink and verify it from inside UDFs.

    One UDF asserts that the ``python`` environment variable equals the
    symlink path; the other asserts that requesting the Java gateway raises
    the expected error.
    """
    link_path = os.path.join(self.tempdir, "py_exec")
    os.symlink(sys.executable, link_path)
    self.t_env.get_config().set_python_executable(link_path)

    def check_python_exec(i):
        import os
        assert os.environ["python"] == link_path
        return i

    def check_pyflink_gateway_disabled(i):
        try:
            from pyflink.java_gateway import get_gateway
            get_gateway()
        except Exception as err:
            assert str(err).startswith(
                "It's launching the PythonGatewayServer during Python UDF"
                " execution which is unexpected.")
        else:
            raise Exception("The gateway server is not disabled!")
        return i

    self.t_env.create_temporary_system_function(
        "check_python_exec",
        udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT()))
    self.t_env.create_temporary_system_function(
        "check_pyflink_gateway_disabled",
        udf(check_pyflink_gateway_disabled, DataTypes.BIGINT(), DataTypes.BIGINT()))

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    rows = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projection = rows.select(
        expr.call('check_python_exec', rows.a),
        expr.call('check_pyflink_gateway_disabled', rows.a))
    projection.execute_insert("Results").wait()

    # Both UDFs return their input unchanged.
    self.assert_equals(source_sink_utils.results(), ["+I[1, 1]", "+I[2, 2]", "+I[3, 3]"])
def test_scalar_function(self):
    """Exercise lambda, ScalarFunction, callable and partial UDFs in one projection."""
    import functools

    # Run with Python metrics switched off.
    self.t_env.get_config().set('python.metric.enabled', 'false')

    # UDF built from a lambda.
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    # UDF built from a ScalarFunction instance.
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())
    # UDF built from a callable object.
    add_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())

    def partial_func(col, param):
        return col + param

    # UDF built from a functools.partial.
    add_one_partial = udf(
        functools.partial(partial_func, param=1), result_type=DataTypes.BIGINT())

    # In "process" mode the UDF reads the worker memory limit variable.
    @udf(result_type=DataTypes.BIGINT())
    def check_memory_limit(exec_mode):
        if exec_mode == "process":
            assert os.environ['_PYTHON_WORKER_MEMORY_LIMIT'] is not None
        return 1

    self.t_env.execute_sql(
        " CREATE TABLE Results(a BIGINT, b BIGINT, c BIGINT, d BIGINT, e BIGINT, f BIGINT,"
        " g BIGINT) WITH ('connector'='test-sink') ")

    execution_mode = self.t_env.get_config().get("python.execution-mode", "process")

    rows = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    kept = rows.where(add_one(rows.b) <= 3)
    projection = kept.select(
        add_one(rows.a),
        subtract_one(rows.b),
        add(rows.a, rows.c),
        add_one_callable(rows.a),
        add_one_partial(rows.a),
        check_memory_limit(execution_mode),
        rows.a)
    projection.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[2, 1, 4, 2, 2, 1, 1]", "+I[4, 0, 12, 4, 4, 1, 3]"])
def execute(self, function_context: FlinkFunctionContext,
            input_list: List[Table]) -> List[Table]:
    """Register the ``search`` UDF and apply it to the first input table.

    :param function_context: supplies the table environment used for registration.
    :param input_list: input tables; only the first one is read.
    :return: a one-element list with the ``face_id`` / ``near_id`` projection.
    """
    env = function_context.get_table_env()
    source = input_list[0]
    search_udf = udf(
        SearchUDTF(self.path, self.element_type),
        DataTypes.STRING(),
        DataTypes.STRING())
    env.register_function("search", search_udf)
    return [source.select("face_id, search(feature_data) as near_id")]
def test_chaining_scalar_function(self):
    """Nest registered UDFs in a string expression and check the sink output."""
    env = self.t_env
    env.register_function(
        "add_one", udf(lambda i: i + 1, DataTypes.BIGINT(), DataTypes.BIGINT()))
    env.register_function(
        "subtract_one", udf(SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT()))
    env.register_function("add", add)

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()]
    env.register_table_sink(
        "Results", source_sink_utils.TestAppendSink(['a', 'b', 'c'], sink_types))

    rows = env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
    projection = rows.select("add(add_one(a), subtract_one(b)), c, 1")
    projection.insert_into("Results")
    env.execute("test")

    self.assert_equals(source_sink_utils.results(), ["3,1,1", "7,2,1", "4,3,1"])
def test_map_with_pandas_udf(self):
    """Chain two pandas UDFs through ``Table.map`` and check the sink rows."""

    def result_row_type():
        # Both UDFs declare the same two-BIGINT row as their result type.
        return DataTypes.ROW([
            DataTypes.FIELD("c", DataTypes.BIGINT()),
            DataTypes.FIELD("d", DataTypes.BIGINT())])

    nested_row = DataTypes.ROW([
        DataTypes.FIELD("c", DataTypes.INT()),
        DataTypes.FIELD("d", DataTypes.INT())])
    input_type = DataTypes.ROW([
        DataTypes.FIELD("a", DataTypes.TINYINT()),
        DataTypes.FIELD("b", nested_row)])
    rows = self.t_env.from_elements(
        [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)), (2, Row(3, 4))],
        input_type)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    def func(x):
        import pandas as pd
        res = pd.concat([x.a, x.c + x.d], axis=1)
        return res

    def func2(x):
        return x * 2

    pandas_udf = udf(func, result_type=result_row_type(), func_type='pandas')
    pandas_udf_2 = udf(func2, result_type=result_row_type(), func_type='pandas')

    mapped = rows.map(pandas_udf).map(pandas_udf_2)
    mapped.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(), ["4,8", "2,10", "2,28", "2,18", "4,14"])
def test_basic_functionality(self):
    """Mix a pandas UDF and a general Python UDF in one filter-and-select query."""
    # pandas UDF
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT(), func_type="pandas")
    # general Python UDF
    subtract_one = udf(SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT())

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd'],
        [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    rows = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    kept = rows.where(add_one(rows.b) <= 3)
    projection = kept.select(
        rows.a,
        rows.b + 1,
        add(rows.a + 1, subtract_one(rows.c)) + 2,
        add(add_one(rows.a), 1))
    projection.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[1, 3, 6, 3]", "+I[3, 2, 14, 5]"])
def test_basic_functionality(self):
    """Mix a pandas UDF and a general Python UDF in string expressions and collect."""
    env = self.t_env
    env.create_temporary_system_function(
        "add_one",
        udf(lambda i: i + 1, result_type=DataTypes.BIGINT(), udf_type="pandas"))
    env.create_temporary_system_function("add", add)
    # general Python UDF
    env.create_temporary_system_function(
        "subtract_one", udf(SubtractOne(), result_type=DataTypes.BIGINT()))

    rows = env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    kept = rows.where("add_one(b) <= 3")
    projection = kept.select(
        "a, b + 1, add(a + 1, subtract_one(c)) + 2, add(add_one(a), 1L)")

    self.assert_equals(self.collect(projection), ["1,3,6,3", "3,2,14,5"])
def test_udf_without_arguments(self):
    """Registered zero-argument UDFs produce their constant once per input row."""
    env = self.t_env
    env.register_function(
        "one", udf(lambda: 1, result_type=DataTypes.BIGINT(), deterministic=True))
    env.register_function(
        "two", udf(lambda: 2, result_type=DataTypes.BIGINT(), deterministic=False))

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    env.register_table_sink("Results", sink)

    rows = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    rows.select("one(), two()").insert_into("Results")
    env.execute("test")

    # Three input rows, so the same constant pair is expected three times.
    self.assert_equals(source_sink_utils.results(), ["1,2"] * 3)
def test_set_environment(self):
    """Point the Python executable at a symlink and verify it from inside UDFs.

    The test is skipped when ``os`` exposes no ``symlink`` function.
    """
    if getattr(os, "symlink", None) is None:
        self.skipTest("Symbolic link is not supported, skip testing 'test_set_python_exec'...")

    link_path = os.path.join(self.tempdir, "py_exec")
    os.symlink(sys.executable, link_path)
    self.t_env.get_config().set_python_executable(link_path)

    def check_python_exec(i):
        import os
        assert os.environ["python"] == link_path
        return i

    self.t_env.register_function(
        "check_python_exec",
        udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT()))

    def check_pyflink_gateway_disabled(i):
        try:
            from pyflink.java_gateway import get_gateway
            get_gateway()
        except Exception as err:
            assert str(err).startswith(
                "It's launching the PythonGatewayServer during Python UDF"
                " execution which is unexpected.")
        else:
            raise Exception("The gateway server is not disabled!")
        return i

    self.t_env.register_function(
        "check_pyflink_gateway_disabled",
        udf(check_pyflink_gateway_disabled, DataTypes.BIGINT(), DataTypes.BIGINT()))

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    rows = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projection = rows.select("check_python_exec(a), check_pyflink_gateway_disabled(a)")
    projection.insert_into("Results")
    self.t_env.execute("test")

    # Both UDFs return their input unchanged.
    self.assert_equals(source_sink_utils.results(), ["1,1", "2,2", "3,3"])
def execute(self, function_context: FlinkFunctionContext,
            input_list: List[Table]) -> List[Table]:
    """Register the ``predict`` UDF and apply it to the first input table.

    :param function_context: supplies the table environment used for registration.
    :param input_list: input tables; only the first one is read.
    :return: a one-element list with the ``face_id`` / ``label`` projection.
    """
    env = function_context.get_table_env()
    source = input_list[0]
    predict_udf = udf(
        f=PredictFunction(None),
        input_types=[DataTypes.STRING()],
        result_type=DataTypes.STRING())
    env.register_function("predict", predict_udf)
    return [source.select('face_id, predict(feature_data) as label')]
def test_scalar_function(self):
    """Exercise lambda, ScalarFunction, callable and partial UDFs in one projection."""
    import functools

    # Run with Python metrics switched off.
    self.t_env.get_config().get_configuration().set_string(
        'python.metric.enabled', 'false')

    # UDF built from a lambda.
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    # UDF built from a ScalarFunction instance.
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())
    # UDF built from a callable object.
    add_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())

    def partial_func(col, param):
        return col + param

    # UDF built from a functools.partial.
    add_one_partial = udf(
        functools.partial(partial_func, param=1), result_type=DataTypes.BIGINT())

    sink_fields = ['a', 'b', 'c', 'd', 'e', 'f']
    sink = source_sink_utils.TestAppendSink(
        sink_fields, [DataTypes.BIGINT() for _ in sink_fields])
    self.t_env.register_table_sink("Results", sink)

    rows = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    kept = rows.where(add_one(rows.b) <= 3)
    projection = kept.select(
        add_one(rows.a),
        subtract_one(rows.b),
        add(rows.a, rows.c),
        add_one_callable(rows.a),
        add_one_partial(rows.a),
        rows.a)
    exec_insert_table(projection, "Results")

    self.assert_equals(source_sink_utils.results(), ["2,1,4,2,2,1", "4,0,12,4,4,3"])
def test_open(self):
    """Run a subtracting UDF with metrics enabled and check the sink rows.

    ``SubtractWithMetrics`` is used only when the configured execution mode
    is ``"process"``; otherwise plain ``Subtract`` is used.
    """
    config = self.t_env.get_config()
    config.set('python.metric.enabled', 'true')

    in_process_mode = config.get("python.execution-mode", None) == "process"
    scalar_function = SubtractWithMetrics() if in_process_mode else Subtract()
    subtract = udf(scalar_function, result_type=DataTypes.BIGINT())

    self.t_env.execute_sql(
        """ CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink') """)

    rows = self.t_env.from_elements([(1, 2), (2, 5), (3, 4)], ['a', 'b'])
    rows.select(rows.a, subtract(rows.b)).execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[1, 1]", "+I[2, 4]", "+I[3, 3]"])
def test_open(self):
    """Run the ``Subtract`` UDF with metrics enabled and check the sink rows."""
    configuration = self.t_env.get_config().get_configuration()
    configuration.set_string('python.metric.enabled', 'true')

    subtract = udf(Subtract(), result_type=DataTypes.BIGINT())

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    rows = self.t_env.from_elements([(1, 2), (2, 5), (3, 4)], ['a', 'b'])
    rows.select(rows.a, subtract(rows.b)).execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[1, 1]", "+I[2, 4]", "+I[3, 3]"])
def test_deterministic(self):
    """Check how the ``deterministic`` flag ends up on ``_deterministic``."""
    bigint = DataTypes.BIGINT

    # Lambda-based UDF: True unless deterministic=False is passed.
    default_lambda = udf(lambda i: i + 1, bigint(), bigint())
    self.assertTrue(default_lambda._deterministic)

    overridden_lambda = udf(lambda i: i + 1, bigint(), bigint(), deterministic=False)
    self.assertFalse(overridden_lambda._deterministic)

    # ScalarFunction-based UDF: True by default.
    subtract_one = udf(SubtractOne(), bigint(), bigint())
    self.assertTrue(subtract_one._deterministic)

    # NOTE(review): ``msg`` is only the text reported if no ValueError is
    # raised; it is not compared against the exception's own message.
    with self.assertRaises(ValueError, msg="Inconsistent deterministic: False and True"):
        udf(SubtractOne(), bigint(), bigint(), deterministic=False)

    self.assertTrue(add._deterministic)

    # Decorator form with deterministic=False.
    @udf(input_types=DataTypes.BIGINT(), result_type=DataTypes.BIGINT(), deterministic=False)
    def non_deterministic_udf(i):
        return i

    self.assertFalse(non_deterministic_udf._deterministic)
def test_invalid_udf(self):
    """Wrapping and registering an object with no ``__call__`` must raise ``TypeError``."""

    class Plus(object):
        # Defines only ``eval``, so instances are not callable.
        def eval(self, col):
            return col + 1

    # NOTE(review): ``msg`` is only the text reported if no TypeError is
    # raised; it is not compared against the exception's own message.
    expected_failure = self.assertRaises(
        TypeError,
        msg="Invalid function: not a function or callable (__call__ is not defined)")
    with expected_failure:
        # test non-callable function
        self.t_env.register_function(
            "non-callable-udf", udf(Plus(), DataTypes.BIGINT(), DataTypes.BIGINT()))
def process(self, execution_context: flink.ExecutionContext,
            input_list: List[Table] = None) -> List[Table]:
    """Register ``sleep_func`` and apply it to the first input grouped by ``word``.

    :param execution_context: its ``table_env`` is used to register the UDF.
    :param input_list: input tables; only the first one is read.
    :return: a one-element list with ``sleep_func(word)`` and a count per group.
    """
    sleep_udf = udf(
        SleepUDF(),
        input_types=[DataTypes.STRING()],
        result_type=DataTypes.STRING())
    execution_context.table_env.register_function("sleep_func", sleep_udf)

    grouped = input_list[0].group_by('word')
    return [grouped.select('sleep_func(word), count(1)')]
def test_name(self):
    """Check the default and explicitly supplied values of a UDF's ``_name``."""
    bigint = DataTypes.BIGINT

    # A lambda reports "<lambda>" unless a name is supplied.
    unnamed_lambda = udf(lambda i: i + 1, result_type=bigint())
    self.assertEqual("<lambda>", unnamed_lambda._name)

    named_lambda = udf(lambda i: i + 1, result_type=bigint(), name="add_one")
    self.assertEqual("add_one", named_lambda._name)

    # A ScalarFunction instance reports its class name unless a name is supplied.
    unnamed_scalar = udf(SubtractOne(), result_type=bigint())
    self.assertEqual("SubtractOne", unnamed_scalar._name)

    named_scalar = udf(SubtractOne(), result_type=bigint(), name="subtract_one")
    self.assertEqual("subtract_one", named_scalar._name)

    self.assertEqual("add", add._name)

    # The decorator form accepts an explicit name as well.
    @udf(result_type=DataTypes.BIGINT(), name="named")
    def named_udf(i):
        return i

    self.assertEqual("named", named_udf._name)
def test_open(self):
    """Run the registered ``subtract`` UDF and check the sink rows."""
    env = self.t_env
    env.register_function(
        "subtract", udf(Subtract(), DataTypes.BIGINT(), DataTypes.BIGINT()))

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    env.register_table_sink("Results", sink)

    rows = env.from_elements([(1, 2), (2, 5), (3, 4)], ['a', 'b'])
    rows.select("a, subtract(b)").insert_into("Results")
    env.execute("test")

    self.assert_equals(source_sink_utils.results(), ["1,1", "2,4", "3,3"])
def execute(self, function_context: FlinkFunctionContext,
            input_list: List[Table]) -> List[Table]:
    """Remove ``/root/test``, register the ``search`` UDF and apply it.

    :param function_context: supplies the table environment used for registration.
    :param input_list: input tables; only the first one is read.
    :return: a one-element list with the ``face_id`` / ``device_id`` /
        ``near_id`` projection.
    """
    import subprocess

    t_env = function_context.get_table_env()
    table = input_list[0]
    # Argument-list form avoids spawning a shell, and run() blocks until the
    # delete has finished. The earlier fire-and-forget Popen left the child
    # unreaped and let the delete race with the code below. A non-zero exit
    # status is still ignored (check=False), as before.
    subprocess.run(['rm', '-rf', '/root/test'], check=False)
    t_env.register_function(
        "search",
        udf(SearchUDTF3(self.path, self.element_type),
            DataTypes.STRING(), DataTypes.INT()))
    return [
        table.select("face_id, device_id, search(feature_data) as near_id")
    ]
def test_overwrite_builtin_function(self):
    """Register a UDF named ``plus`` and check that the query uses it.

    The UDF computes ``i + j - 1``, so the expected rows are one less than
    a plain sum of columns ``a`` and ``b``.
    """
    custom_plus = udf(lambda i, j: i + j - 1, result_type=DataTypes.BIGINT())
    self.t_env.create_temporary_system_function("plus", custom_plus)

    sink = source_sink_utils.TestAppendSink(['a'], [DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    rows = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    exec_insert_table(rows.select("plus(a, b)"), "Results")

    self.assert_equals(source_sink_utils.results(), ["2", "6", "3"])
def test_open(self):
    """Run the ``subtract`` system function with metrics enabled and check the sink."""
    env = self.t_env
    env.get_config().get_configuration().set_string('python.metric.enabled', 'true')
    env.create_temporary_system_function(
        "subtract", udf(Subtract(), result_type=DataTypes.BIGINT()))

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    env.register_table_sink("Results", sink)

    rows = env.from_elements([(1, 2), (2, 5), (3, 4)], ['a', 'b'])
    exec_insert_table(rows.select("a, subtract(b)"), "Results")

    self.assert_equals(source_sink_utils.results(), ["1,1", "2,4", "3,3"])
def test_scalar_function(self):
    """Exercise lambda, ScalarFunction, callable and partial UDFs in one projection."""
    import functools

    # Run with Python metrics switched off.
    self.t_env.get_config().get_configuration().set_string('python.metric.enabled', 'false')

    # UDF built from a lambda.
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    # UDF built from a ScalarFunction instance.
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())
    # UDF built from a callable object.
    add_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())

    def partial_func(col, param):
        return col + param

    # UDF built from a functools.partial.
    add_one_partial = udf(
        functools.partial(partial_func, param=1), result_type=DataTypes.BIGINT())

    # The UDF reads the worker memory limit variable from its environment.
    @udf(result_type=DataTypes.BIGINT())
    def check_memory_limit():
        assert os.environ['_PYTHON_WORKER_MEMORY_LIMIT'] is not None
        return 1

    sink_fields = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    sink = source_sink_utils.TestAppendSink(
        sink_fields, [DataTypes.BIGINT() for _ in sink_fields])
    self.t_env.register_table_sink("Results", sink)

    rows = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    kept = rows.where(add_one(rows.b) <= 3)
    projection = kept.select(
        add_one(rows.a),
        subtract_one(rows.b),
        add(rows.a, rows.c),
        add_one_callable(rows.a),
        add_one_partial(rows.a),
        check_memory_limit(),
        rows.a)
    projection.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[2, 1, 4, 2, 2, 1, 1]", "+I[4, 0, 12, 4, 4, 1, 3]"])
def test_scalar_function(self):
    """Register lambda, ScalarFunction, plain, callable and partial UDFs and query them."""
    import functools

    env = self.t_env
    bigint = DataTypes.BIGINT

    # UDF built from a lambda.
    env.register_function("add_one", udf(lambda i: i + 1, bigint(), bigint()))
    # UDF built from a ScalarFunction instance.
    env.register_function("subtract_one", udf(SubtractOne(), bigint(), bigint()))
    # Plain Python function defined outside this test.
    env.register_function("add", add)
    # UDF built from a callable object.
    env.register_function("add_one_callable", udf(CallablePlus(), bigint(), bigint()))

    def partial_func(col, param):
        return col + param

    # UDF built from a functools.partial.
    env.register_function(
        "add_one_partial",
        udf(functools.partial(partial_func, param=1), bigint(), bigint()))

    sink_fields = ['a', 'b', 'c', 'd', 'e']
    sink = source_sink_utils.TestAppendSink(sink_fields, [bigint() for _ in sink_fields])
    env.register_table_sink("Results", sink)

    rows = env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    kept = rows.where("add_one(b) <= 3")
    projection = kept.select(
        "add_one(a), subtract_one(b), add(a, c), add_one_callable(a), "
        "add_one_partial(a)")
    projection.insert_into("Results")
    env.execute("test")

    self.assert_equals(source_sink_utils.results(), ["2,1,4,2,2", "4,0,12,4,4"])
def test_group_aggregate_function(self):
    """Feed general and pandas UDF results into pandas UDAFs under a group-by."""

    def int_pair_row():
        # Row of two INT fields, used by the sink column and the UDAF result.
        return DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.INT()),
            DataTypes.FIELD("b", DataTypes.INT())])

    input_type = DataTypes.ROW([
        DataTypes.FIELD("a", DataTypes.TINYINT()),
        DataTypes.FIELD("b", DataTypes.SMALLINT()),
        DataTypes.FIELD("c", DataTypes.INT())])
    rows = self.t_env.from_elements(
        [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        input_type)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c'],
        [DataTypes.TINYINT(), DataTypes.FLOAT(), int_pair_row()])
    self.t_env.register_table_sink("Results", sink)

    # general udf
    plus_one = udf(lambda a: a + 1, result_type=DataTypes.INT())
    # pandas udf
    minus_one = udf(lambda a: a - 1, result_type=DataTypes.INT(), func_type="pandas")
    # pandas udaf returning (max, min) of its input
    max_udaf = udaf(
        lambda a: (a.max(), a.min()),
        result_type=int_pair_row(),
        func_type="pandas")

    grouped = rows.group_by("a")
    aggregated = grouped.select(
        rows.a, mean_udaf(plus_one(rows.b)), max_udaf(minus_one(rows.c)))
    aggregated.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[1, 6.0, +I[5, 2]]", "+I[2, 3.0, +I[3, 2]]", "+I[3, 3.0, +I[2, 2]]"])
def test_execute_from_json_plan(self):
    """Compile an INSERT using a Python UDF to a JSON plan, run it, check the output.

    A CSV source file is written under ``self.tempdir``. The plan is obtained
    and executed through the underlying Java table environment, and every
    file the filesystem sink produced is read back and compared.
    """
    import glob
    from py4j.java_gateway import get_method

    # create source file path
    tmp_dir = self.tempdir
    data = ['1,1', '3,3', '2,2']
    source_path = tmp_dir + '/test_execute_from_json_plan_input.csv'
    sink_path = tmp_dir + '/test_execute_from_json_plan_out'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    source_table = (
        " CREATE TABLE source_table ( a BIGINT, b BIGINT ) "
        "WITH ( 'connector' = 'filesystem', 'path' = '%s', 'format' = 'csv' ) "
    ) % source_path
    self.t_env.execute_sql(source_table)

    sink_table = (
        " CREATE TABLE sink_table ( id BIGINT, data BIGINT ) "
        "WITH ( 'connector' = 'filesystem', 'path' = '%s', 'format' = 'csv' ) "
    ) % sink_path
    self.t_env.execute_sql(sink_table)

    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    self.t_env.create_temporary_system_function("add_one", add_one)

    json_plan = self.t_env._j_tenv.getJsonPlan(
        "INSERT INTO sink_table SELECT "
        "a, "
        "add_one(b) "
        "FROM source_table")
    # ``await`` is a Python keyword, so the Java method is looked up by name.
    get_method(self.t_env._j_tenv.executeJsonPlan(json_plan), "await")()

    # Read each sink file under ``with`` so its handle is closed right away;
    # the earlier comprehension left every opened file unclosed.
    lines = []
    for part_file in glob.glob(sink_path + '/*'):
        with open(part_file, 'r') as part:
            lines.extend(line.strip() for line in part)
    lines.sort()
    self.assertEqual(lines, ['1,2', '2,3', '3,4'])
def test_add_python_file(self):
    """Add two helper modules with ``add_python_file`` and import them in user code.

    ``test_dep1`` is imported by a DataStream map function and ``test_dep2``
    by a Table API UDF; the pipeline adds 2 and then 3 to each input value.
    """
    import uuid

    env = self.env
    dep_dir = os.path.join(self.tempdir, "python_file_dir_" + str(uuid.uuid4()))
    os.mkdir(dep_dir)

    first_dep = os.path.join(dep_dir, "test_dep1.py")
    with open(first_dep, 'w') as f:
        f.write("def add_two(a):\n return a + 2")

    def plus_two_map(value):
        from test_dep1 import add_two
        return add_two(value)

    j_configuration = get_j_env_configuration(env._j_stream_execution_environment)
    j_configuration.setString("taskmanager.numberOfTaskSlots", "10")
    env.add_python_file(first_dep)

    ds = env.from_collection([1, 2, 3, 4, 5])
    ds = (ds.map(plus_two_map, Types.LONG())
          .slot_sharing_group("data_stream")
          .map(lambda i: i, Types.LONG())
          .slot_sharing_group("table"))

    second_dep = os.path.join(dep_dir, "test_dep2.py")
    with open(second_dep, 'w') as f:
        f.write("def add_three(a):\n return a + 3")

    def plus_three(value):
        from test_dep2 import add_three
        return add_three(value)

    t_env = StreamTableEnvironment.create(
        stream_execution_environment=env,
        environment_settings=EnvironmentSettings.in_streaming_mode())
    env.add_python_file(second_dep)

    from pyflink.table.udf import udf
    from pyflink.table.expressions import col

    add_three = udf(plus_three, result_type=DataTypes.BIGINT())
    tab = t_env.from_data_stream(ds, col('a')).select(add_three(col('a')))
    appended = t_env.to_append_stream(tab, Types.ROW([Types.LONG()]))
    appended.map(lambda i: i[0]).add_sink(self.test_sink)
    env.execute("test add_python_file")

    result = self.test_sink.get_results(True)
    expected = ['6', '7', '8', '9', '10']
    # Both sides are string lists, so both are sorted lexicographically.
    result.sort()
    expected.sort()
    self.assertEqual(expected, result)
def test_set_requirements_with_cached_directory(self):
    """Install a requirement from a local cache directory and import it inside a UDF.

    A requirements file pinning ``python-package1==0.0.0`` and a directory
    holding the matching source archive are handed to
    ``set_python_requirements``; the UDF then imports ``plus`` from
    ``python_package1``.
    """
    import base64

    tmp_dir = self.tempdir

    requirements_txt_path = os.path.join(
        tmp_dir, "requirements_txt_" + str(uuid.uuid4()))
    with open(requirements_txt_path, 'w') as f:
        f.write("python-package1==0.0.0")

    requirements_dir_path = os.path.join(
        tmp_dir, "requirements_dir_" + str(uuid.uuid4()))
    os.mkdir(requirements_dir_path)
    package_file_name = "python-package1-0.0.0.tar.gz"

    # Base64 of a python package archive that ships a "python_package1" module
    # exposing a "plus(a, b)" function. It can be regenerated with:
    # base64.b64encode(open("python-package1-0.0.0.tar.gz", "rb").read()).decode("utf-8")
    encoded_package = (
        "H4sICNefrV0C/2Rpc3QvcHl0aG9uLXBhY2thZ2UxLTAuMC4wLnRhcgDtmVtv2jAYhnPtX2H1CrRCY+ckI"
        "XEx7axuUA11u5imyICTRc1JiVnHfv1MKKWjYxwKEdPehws7xkmUfH5f+3PyqfqWpa1cjG5EKFnLbOvfhX"
        "FQTI3nOPPSdavS5Pa8nGMwy3Esi3ke9wyTObbnGNQxamBSKlFQavzUryG8ldG6frpbEGx4yNmDLMp/hPy"
        "P8b+6fNN613vdP1z8XdteG3+ug/17/F3Hcw1qIv5H54NUYiyUaH2SRRllaYeytkl6IpEdujI2yH2XapCQ"
        "wSRJRDHt0OveZa//uUfeZonUvUO5bHo+0ZcoVo9bMhFRvGx9H41kWj447aUsR0WUq+pui8arWKggK5Jli"
        "wGOo/95q79ovXi6/nfyf246Dof/n078fT9KI+X77Xx6BP83bX4Xf5NxT7dz7toO/L8OxjKgeTwpG+KcDp"
        "sdQjWFVJMipYI+o0MCk4X/t2UYtqI0yPabCHb3f861XcD/Ty/+Y5nLdCzT0dSPo/SmbKsf6un+b7KV+Ls"
        "W4/D/OoC9w/930P9eGwM75//csrD+Q/6P/P/k9D/oX3988Wqw1bS/tf6tR+s/m3EG/ddBqXO9XKf15C8p"
        "P9k4HZBtBgzZaVW5vrfKcj+W32W82ygEB9D/Xu9+4/qfP9L/rBv0X1v87yONKRX61/qfzwqjIDzIPTbv/"
        "7or3/88i0H/tfBFW7s/s/avRInQH06ieEy7tDrQeYHUdRN7wP+n/vf62LOH/pld7f9xz7a5Pfufedy0oP"
        "86iJI8KxStAq6yLC4JWdbbVbWRikR2z1ZGytk5vauW3QdnBFE6XqwmykazCesAAAAAAAAAAAAAAAAAAAA"
        "AAAAAAAAAAAAAAOBw/AJw5CHBAFAAAA==")
    with open(os.path.join(requirements_dir_path, package_file_name), 'wb') as f:
        f.write(base64.b64decode(encoded_package))

    self.st_env.set_python_requirements(requirements_txt_path, requirements_dir_path)

    def add_one(i):
        from python_package1 import plus
        return plus(i, 1)

    self.st_env.create_temporary_system_function(
        "add_one", udf(add_one, DataTypes.BIGINT(), DataTypes.BIGINT()))

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.st_env.register_table_sink("Results", sink)

    rows = self.st_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projection = rows.select(expr.call('add_one', rows.a), rows.a)
    projection.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[2, 1]", "+I[3, 2]", "+I[4, 3]"])