Ejemplo n.º 1
0
    def test_udf_without_arguments(self):
        """Check that UDFs taking no arguments can be used in a projection.

        One UDF is declared deterministic and the other non-deterministic;
        each of the three input rows must produce the constants ``1, 2``.
        """
        const_one = udf(lambda: 1, result_type=DataTypes.BIGINT(), deterministic=True)
        const_two = udf(lambda: 2, result_type=DataTypes.BIGINT(), deterministic=False)

        # Create the sink table the projection is inserted into.
        self.t_env.execute_sql("""
                        CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink')
                        """)

        source = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        projected = source.select(const_one(), const_two())
        projected.execute_insert("Results").wait()

        expected = ["+I[1, 2]"] * 3
        self.assert_equals(source_sink_utils.results(), expected)
Ejemplo n.º 2
0
    def test_udf_without_arguments(self):
        """Check that zero-argument UDFs work when writing to an append sink.

        A deterministic and a non-deterministic constant UDF are projected for
        each of the three input rows, so every emitted row must be ``1, 2``.
        """
        const_one = udf(lambda: 1, result_type=DataTypes.BIGINT(), deterministic=True)
        const_two = udf(lambda: 2, result_type=DataTypes.BIGINT(), deterministic=False)

        sink_columns = ['a', 'b']
        sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
        sink = source_sink_utils.TestAppendSink(sink_columns, sink_types)
        self.t_env.register_table_sink("Results", sink)

        source = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        source.select(const_one(), const_two()).execute_insert("Results").wait()

        self.assert_equals(source_sink_utils.results(), ["+I[1, 2]"] * 3)
Ejemplo n.º 3
0
    def test_chaining_scalar_function(self):
        """Check that Python UDFs can be nested inside one another.

        ``add`` is applied to the results of two other UDFs, and the outcome is
        projected together with a plain column and a literal.
        """
        increment = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        decrement = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()]
        sink = source_sink_utils.TestAppendSink(['a', 'b', 'c'], sink_types)
        self.t_env.register_table_sink("Results", sink)

        rows = [(1, 2, 1), (2, 5, 2), (3, 1, 3)]
        t = self.t_env.from_elements(rows, ['a', 'b', 'c'])
        chained = add(increment(t.a), decrement(t.b))
        t.select(chained, t.c, expr.lit(1)).execute_insert("Results").wait()

        self.assert_equals(source_sink_utils.results(), ["3,1,1", "7,2,1", "4,3,1"])
Ejemplo n.º 4
0
    def test_set_environment(self):
        """Check the environment seen by Python UDFs during execution.

        The Python executable is configured to a symlink created in the test's
        temp directory. Two UDFs then assert, from inside the worker, that the
        ``python`` environment variable equals that link and that calling
        ``get_gateway()`` raises the expected error.
        """
        # The test relies on a symlink; skip where the platform offers none.
        if getattr(os, "symlink", None) is None:
            self.skipTest("Symbolic link is not supported, skip testing 'test_set_environment'...")

        python_exec = sys.executable
        tmp_dir = self.tempdir
        python_exec_link_path = os.path.join(tmp_dir, "py_exec")
        os.symlink(python_exec, python_exec_link_path)
        self.t_env.get_config().set_python_executable(python_exec_link_path)

        def check_python_exec(i):
            # Imported locally so the function carries its own dependency.
            import os
            assert os.environ["python"] == python_exec_link_path
            return i

        self.t_env.create_temporary_system_function(
            "check_python_exec",
            udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT()))

        def check_pyflink_gateway_disabled(i):
            try:
                from pyflink.java_gateway import get_gateway
                get_gateway()
            except Exception as e:
                assert str(e).startswith(
                    "It's launching the PythonGatewayServer during Python UDF"
                    " execution which is unexpected.")
            else:
                # Reaching here means get_gateway() succeeded, which must not happen.
                raise Exception("The gateway server is not disabled!")
            return i

        self.t_env.create_temporary_system_function(
            "check_pyflink_gateway_disabled",
            udf(check_pyflink_gateway_disabled, DataTypes.BIGINT(),
                DataTypes.BIGINT()))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        t.select(
            expr.call('check_python_exec', t.a),
            expr.call('check_pyflink_gateway_disabled', t.a)) \
            .execute_insert("Results").wait()

        # Both UDFs return their input unchanged, so column ``a`` appears twice.
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[1, 1]", "+I[2, 2]", "+I[3, 3]"])
Ejemplo n.º 5
0
    def test_scalar_function(self):
        """Exercise the supported ways of defining a scalar Python UDF.

        Covers a lambda, a ``SubtractOne`` instance, a ``CallablePlus``
        instance, a ``functools.partial`` and a decorated function, with
        Python metrics disabled.
        """
        import functools

        config = self.t_env.get_config()
        # test metric disabled.
        config.set('python.metric.enabled', 'false')

        # lambda function
        plus_one_lambda = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        # Python ScalarFunction
        minus_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())
        # callable object
        plus_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())

        def partial_func(col, param):
            return col + param

        # partial function
        plus_one_partial = udf(functools.partial(partial_func, param=1),
                               result_type=DataTypes.BIGINT())

        # check memory limit is set (only asserted for the "process" mode)
        @udf(result_type=DataTypes.BIGINT())
        def check_memory_limit(exec_mode):
            if exec_mode == "process":
                assert os.environ['_PYTHON_WORKER_MEMORY_LIMIT'] is not None
            return 1

        sink_table_ddl = """
            CREATE TABLE Results(a BIGINT, b BIGINT, c BIGINT, d BIGINT, e BIGINT, f BIGINT,
             g BIGINT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)

        mode = self.t_env.get_config().get("python.execution-mode", "process")

        rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
        t = self.t_env.from_elements(rows, ['a', 'b', 'c'])
        kept = t.where(plus_one_lambda(t.b) <= 3)
        projected = kept.select(
            plus_one_lambda(t.a),
            minus_one(t.b),
            add(t.a, t.c),
            plus_one_callable(t.a),
            plus_one_partial(t.a),
            check_memory_limit(mode),
            t.a)
        projected.execute_insert("Results").wait()

        expected = ["+I[2, 1, 4, 2, 2, 1, 1]", "+I[4, 0, 12, 4, 4, 1, 3]"]
        self.assert_equals(source_sink_utils.results(), expected)
Ejemplo n.º 6
0
 def execute(self, function_context: FlinkFunctionContext,
             input_list: List[Table]) -> List[Table]:
     """Register the ``search`` UDF and apply it to the first input table.

     Returns a single-element list holding the projection of ``face_id`` and
     ``search(feature_data)`` (aliased ``near_id``).
     """
     table_env = function_context.get_table_env()
     source = input_list[0]
     search_udf = udf(SearchUDTF(self.path, self.element_type),
                      DataTypes.STRING(), DataTypes.STRING())
     table_env.register_function("search", search_udf)
     projected = source.select("face_id, search(feature_data) as near_id")
     return [projected]
Ejemplo n.º 7
0
    def test_chaining_scalar_function(self):
        """Check nested UDF calls written as a string expression.

        Three functions are registered by name and combined in a single
        ``select`` string; the job is run through ``execute("test")``.
        """
        bigint = DataTypes.BIGINT

        increment = udf(lambda i: i + 1, bigint(), bigint())
        self.t_env.register_function("add_one", increment)
        decrement = udf(SubtractOne(), bigint(), bigint())
        self.t_env.register_function("subtract_one", decrement)
        self.t_env.register_function("add", add)

        sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'], [bigint(), bigint(), DataTypes.INT()])
        self.t_env.register_table_sink("Results", sink)

        rows = [(1, 2, 1), (2, 5, 2), (3, 1, 3)]
        source = self.t_env.from_elements(rows, ['a', 'b', 'c'])
        projected = source.select("add(add_one(a), subtract_one(b)), c, 1")
        projected.insert_into("Results")
        self.t_env.execute("test")

        self.assert_equals(source_sink_utils.results(), ["3,1,1", "7,2,1", "4,3,1"])
Ejemplo n.º 8
0
    def test_map_with_pandas_udf(self):
        """Check that two pandas UDFs can be applied back to back via ``map``.

        The first UDF concatenates column ``a`` with ``c + d``; the second
        doubles every value of the result.
        """
        nested_row_type = DataTypes.ROW([
            DataTypes.FIELD("c", DataTypes.INT()),
            DataTypes.FIELD("d", DataTypes.INT())
        ])
        input_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", nested_row_type)
        ])
        rows = [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)),
                (2, Row(3, 4))]
        t = self.t_env.from_elements(rows, input_type)

        sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", sink)

        def func(x):
            import pandas as pd
            res = pd.concat([x.a, x.c + x.d], axis=1)
            return res

        def func2(x):
            return x * 2

        first_stage = udf(
            func,
            result_type=DataTypes.ROW([
                DataTypes.FIELD("c", DataTypes.BIGINT()),
                DataTypes.FIELD("d", DataTypes.BIGINT())
            ]),
            func_type='pandas')

        second_stage = udf(
            func2,
            result_type=DataTypes.ROW([
                DataTypes.FIELD("c", DataTypes.BIGINT()),
                DataTypes.FIELD("d", DataTypes.BIGINT())
            ]),
            func_type='pandas')

        mapped = t.map(first_stage).map(second_stage)
        mapped.execute_insert("Results").wait()

        expected = ["4,8", "2,10", "2,28", "2,18", "4,14"]
        self.assert_equals(source_sink_utils.results(), expected)
Ejemplo n.º 9
0
    def test_basic_functionality(self):
        """Mix a pandas UDF and a general Python UDF in one filter/projection."""
        # pandas UDF
        pandas_add_one = udf(
            lambda i: i + 1, result_type=DataTypes.BIGINT(), func_type="pandas")

        # general Python UDF
        general_subtract_one = udf(
            SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT())

        sink_types = [DataTypes.BIGINT() for _ in range(4)]
        sink = source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd'], sink_types)
        self.t_env.register_table_sink("Results", sink)

        rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
        t = self.t_env.from_elements(rows, ['a', 'b', 'c'])

        kept = t.where(pandas_add_one(t.b) <= 3)
        projected = kept.select(
            t.a,
            t.b + 1,
            add(t.a + 1, general_subtract_one(t.c)) + 2,
            add(pandas_add_one(t.a), 1))
        projected.execute_insert("Results").wait()

        expected = ["+I[1, 3, 6, 3]", "+I[3, 2, 14, 5]"]
        self.assert_equals(source_sink_utils.results(), expected)
Ejemplo n.º 10
0
    def test_basic_functionality(self):
        """Mix a pandas UDF and a general UDF using string expressions.

        The functions are registered as temporary system functions and the
        query result is gathered with ``self.collect``.
        """
        pandas_add_one = udf(lambda i: i + 1,
                             result_type=DataTypes.BIGINT(),
                             udf_type="pandas")
        self.t_env.create_temporary_system_function("add_one", pandas_add_one)

        self.t_env.create_temporary_system_function("add", add)

        # general Python UDF
        general_subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())
        self.t_env.create_temporary_system_function(
            "subtract_one", general_subtract_one)

        rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
        source = self.t_env.from_elements(rows, ['a', 'b', 'c'])
        kept = source.where("add_one(b) <= 3")
        projected = kept.select(
            "a, b + 1, add(a + 1, subtract_one(c)) + 2, add(add_one(a), 1L)")

        self.assert_equals(self.collect(projected), ["1,3,6,3", "3,2,14,5"])
Ejemplo n.º 11
0
    def test_udf_without_arguments(self):
        """Check zero-argument UDFs registered by name and called from a string.

        ``one`` is registered as deterministic and ``two`` as
        non-deterministic; each input row must yield ``1,2``.
        """
        const_one = udf(lambda: 1, result_type=DataTypes.BIGINT(), deterministic=True)
        self.t_env.register_function("one", const_one)
        const_two = udf(lambda: 2, result_type=DataTypes.BIGINT(), deterministic=False)
        self.t_env.register_function("two", const_two)

        sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", sink)

        source = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        projected = source.select("one(), two()")
        projected.insert_into("Results")
        self.t_env.execute("test")

        self.assert_equals(source_sink_utils.results(), ["1,2"] * 3)
Ejemplo n.º 12
0
    def test_set_environment(self):
        """Check the environment seen by Python UDFs during execution.

        Skipped when ``os.symlink`` is unavailable. Otherwise the Python
        executable is pointed at a symlink in the temp directory, and two UDFs
        assert that the ``python`` environment variable equals that link and
        that ``get_gateway()`` raises the expected error.
        """
        symlink = getattr(os, "symlink", None)
        if symlink is None:
            self.skipTest("Symbolic link is not supported, skip testing 'test_set_python_exec'...")

        python_exec_link_path = os.path.join(self.tempdir, "py_exec")
        os.symlink(sys.executable, python_exec_link_path)
        self.t_env.get_config().set_python_executable(python_exec_link_path)

        def check_python_exec(i):
            import os
            assert os.environ["python"] == python_exec_link_path
            return i

        exec_check_udf = udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT())
        self.t_env.register_function("check_python_exec", exec_check_udf)

        def check_pyflink_gateway_disabled(i):
            try:
                from pyflink.java_gateway import get_gateway
                get_gateway()
            except Exception as e:
                assert str(e).startswith("It's launching the PythonGatewayServer during Python UDF"
                                         " execution which is unexpected.")
            else:
                raise Exception("The gateway server is not disabled!")
            return i

        gateway_check_udf = udf(
            check_pyflink_gateway_disabled, DataTypes.BIGINT(), DataTypes.BIGINT())
        self.t_env.register_function("check_pyflink_gateway_disabled", gateway_check_udf)

        sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", sink)

        source = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        projected = source.select(
            "check_python_exec(a), check_pyflink_gateway_disabled(a)")
        projected.insert_into("Results")
        self.t_env.execute("test")

        # Both UDFs return their input, so column ``a`` shows up twice per row.
        self.assert_equals(source_sink_utils.results(), ["1,1", "2,2", "3,3"])
Ejemplo n.º 13
0
 def execute(self, function_context: FlinkFunctionContext,
             input_list: List[Table]) -> List[Table]:
     """Register the ``predict`` UDF and apply it to the first input table.

     Returns a single-element list holding the projection of ``face_id`` and
     ``predict(feature_data)`` (aliased ``label``).
     """
     table_env = function_context.get_table_env()
     source = input_list[0]
     predict_udf = udf(f=PredictFunction(None),
                       input_types=[DataTypes.STRING()],
                       result_type=DataTypes.STRING())
     table_env.register_function("predict", predict_udf)
     labelled = source.select('face_id, predict(feature_data) as label')
     return [labelled]
Ejemplo n.º 14
0
    def test_scalar_function(self):
        """Exercise the supported ways of defining a scalar Python UDF.

        Covers a lambda, a ``SubtractOne`` instance, a ``CallablePlus``
        instance and a ``functools.partial``, with Python metrics disabled.
        """
        import functools

        # test metric disabled.
        configuration = self.t_env.get_config().get_configuration()
        configuration.set_string('python.metric.enabled', 'false')

        # lambda function
        plus_one_lambda = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        # Python ScalarFunction
        minus_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())
        # callable object
        plus_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())

        def partial_func(col, param):
            return col + param

        # partial function
        plus_one_partial = udf(functools.partial(partial_func, param=1),
                               result_type=DataTypes.BIGINT())

        sink_columns = ['a', 'b', 'c', 'd', 'e', 'f']
        sink_types = [DataTypes.BIGINT() for _ in sink_columns]
        sink = source_sink_utils.TestAppendSink(sink_columns, sink_types)
        self.t_env.register_table_sink("Results", sink)

        rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
        t = self.t_env.from_elements(rows, ['a', 'b', 'c'])
        kept = t.where(plus_one_lambda(t.b) <= 3)
        projected = kept.select(
            plus_one_lambda(t.a),
            minus_one(t.b),
            add(t.a, t.c),
            plus_one_callable(t.a),
            plus_one_partial(t.a),
            t.a)
        exec_insert_table(projected, "Results")

        self.assert_equals(source_sink_utils.results(),
                           ["2,1,4,2,2,1", "4,0,12,4,4,3"])
Ejemplo n.º 15
0
    def test_open(self):
        """Run a ScalarFunction-based UDF with Python metrics enabled.

        ``SubtractWithMetrics`` is used when the configured execution mode is
        ``"process"``; otherwise plain ``Subtract`` is used.
        """
        config = self.t_env.get_config()
        config.set('python.metric.enabled', 'true')
        mode = self.t_env.get_config().get("python.execution-mode", None)

        implementation = SubtractWithMetrics() if mode == "process" else Subtract()
        subtract = udf(implementation, result_type=DataTypes.BIGINT())

        self.t_env.execute_sql("""
                        CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink')
                        """)

        source = self.t_env.from_elements([(1, 2), (2, 5), (3, 4)], ['a', 'b'])
        projected = source.select(source.a, subtract(source.b))
        projected.execute_insert("Results").wait()

        expected = ["+I[1, 1]", "+I[2, 4]", "+I[3, 3]"]
        self.assert_equals(source_sink_utils.results(), expected)
Ejemplo n.º 16
0
    def test_open(self):
        """Run the ``Subtract`` UDF with Python metrics enabled."""
        configuration = self.t_env.get_config().get_configuration()
        configuration.set_string('python.metric.enabled', 'true')

        subtract = udf(Subtract(), result_type=DataTypes.BIGINT())

        sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", sink)

        source = self.t_env.from_elements([(1, 2), (2, 5), (3, 4)], ['a', 'b'])
        projected = source.select(source.a, subtract(source.b))
        projected.execute_insert("Results").wait()

        expected = ["+I[1, 1]", "+I[2, 4]", "+I[3, 3]"]
        self.assert_equals(source_sink_utils.results(), expected)
Ejemplo n.º 17
0
    def test_deterministic(self):
        """Check how the ``_deterministic`` flag of a UDF is derived.

        Covers the default, an explicit ``deterministic=False``, a
        ``SubtractOne`` instance, the conflicting-override error, the
        module-level ``add`` and the decorator form.
        """
        bigint = DataTypes.BIGINT

        # Lambda UDFs default to deterministic ...
        default_lambda = udf(lambda i: i + 1, bigint(), bigint())
        self.assertTrue(default_lambda._deterministic)

        # ... unless told otherwise.
        overridden_lambda = udf(lambda i: i + 1, bigint(), bigint(), deterministic=False)
        self.assertFalse(overridden_lambda._deterministic)

        from_instance = udf(SubtractOne(), bigint(), bigint())
        self.assertTrue(from_instance._deterministic)

        # Overriding SubtractOne's determinism with a different value is rejected.
        with self.assertRaises(ValueError, msg="Inconsistent deterministic: False and True"):
            udf(SubtractOne(), bigint(), bigint(), deterministic=False)

        self.assertTrue(add._deterministic)

        @udf(input_types=bigint(), result_type=bigint(), deterministic=False)
        def non_deterministic_udf(i):
            return i

        self.assertFalse(non_deterministic_udf._deterministic)
Ejemplo n.º 18
0
    def test_invalid_udf(self):
        """Check that wrapping a non-callable object with ``udf`` raises TypeError."""
        class Plus(object):
            # Deliberately defines ``eval`` but no ``__call__``.
            def eval(self, col):
                return col + 1

        failure_msg = "Invalid function: not a function or callable (__call__ is not defined)"
        with self.assertRaises(TypeError, msg=failure_msg):
            # test non-callable function
            invalid = udf(Plus(), DataTypes.BIGINT(), DataTypes.BIGINT())
            self.t_env.register_function("non-callable-udf", invalid)
 def process(self,
             execution_context: flink.ExecutionContext,
             input_list: List[Table] = None) -> List[Table]:
     """Register ``sleep_func`` and apply it to the first input, grouped by ``word``.

     Returns a single-element list holding the projection
     ``sleep_func(word), count(1)`` of the grouped table.
     """
     sleep_udf = udf(SleepUDF(),
                     input_types=[DataTypes.STRING()],
                     result_type=DataTypes.STRING())
     execution_context.table_env.register_function("sleep_func", sleep_udf)

     grouped = input_list[0].group_by('word')
     counted = grouped.select('sleep_func(word), count(1)')
     return [counted]
Ejemplo n.º 20
0
    def test_name(self):
        """Check how the ``_name`` of a UDF is derived and overridden.

        Covers a lambda, a ``SubtractOne`` instance, the module-level ``add``
        and the decorator form, each with and without an explicit ``name``.
        """
        bigint = DataTypes.BIGINT

        anonymous = udf(lambda i: i + 1, result_type=bigint())
        self.assertEqual("<lambda>", anonymous._name)

        renamed_lambda = udf(lambda i: i + 1, result_type=bigint(), name="add_one")
        self.assertEqual("add_one", renamed_lambda._name)

        from_instance = udf(SubtractOne(), result_type=bigint())
        self.assertEqual("SubtractOne", from_instance._name)

        renamed_instance = udf(SubtractOne(), result_type=bigint(), name="subtract_one")
        self.assertEqual("subtract_one", renamed_instance._name)

        self.assertEqual("add", add._name)

        # An explicit name wins over the decorated function's own name.
        @udf(result_type=bigint(), name="named")
        def named_udf(i):
            return i

        self.assertEqual("named", named_udf._name)
Ejemplo n.º 21
0
    def test_open(self):
        """Run the ``Subtract`` UDF registered by name and called from a string."""
        subtract_udf = udf(Subtract(), DataTypes.BIGINT(), DataTypes.BIGINT())
        self.t_env.register_function("subtract", subtract_udf)

        sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", sink)

        source = self.t_env.from_elements([(1, 2), (2, 5), (3, 4)], ['a', 'b'])
        projected = source.select("a, subtract(b)")
        projected.insert_into("Results")
        self.t_env.execute("test")

        self.assert_equals(source_sink_utils.results(), ["1,1", "2,4", "3,3"])
Ejemplo n.º 22
0
 def execute(self, function_context: FlinkFunctionContext,
             input_list: List[Table]) -> List[Table]:
     """Clear ``/root/test``, register the ``search`` UDF and apply it.

     Returns a single-element list holding the projection of ``face_id``,
     ``device_id`` and ``search(feature_data)`` (aliased ``near_id``) over
     the first input table.
     """
     t_env = function_context.get_table_env()
     table = input_list[0]
     # Run the cleanup without a shell and wait for it, so the removal has
     # finished (and the child is reaped) before the UDF is registered.
     # NOTE(review): the path is hard-coded and the removal is destructive;
     # presumably it holds output of an earlier run -- confirm.
     Popen(['rm', '-rf', '/root/test']).wait()
     t_env.register_function(
         "search",
         udf(SearchUDTF3(self.path, self.element_type), DataTypes.STRING(),
             DataTypes.INT()))
     return [
         table.select("face_id, device_id, search(feature_data) as near_id")
     ]
Ejemplo n.º 23
0
    def test_overwrite_builtin_function(self):
        """Check that a Python UDF registered as ``plus`` is the one invoked.

        The UDF computes ``i + j - 1``, so the expected values are one less
        than a plain sum of columns ``a`` and ``b``.
        """
        custom_plus = udf(lambda i, j: i + j - 1, result_type=DataTypes.BIGINT())
        self.t_env.create_temporary_system_function("plus", custom_plus)

        sink = source_sink_utils.TestAppendSink(['a'], [DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", sink)

        rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
        source = self.t_env.from_elements(rows, ['a', 'b', 'c'])
        projected = source.select("plus(a, b)")
        exec_insert_table(projected, "Results")

        self.assert_equals(source_sink_utils.results(), ["2", "6", "3"])
Ejemplo n.º 24
0
    def test_open(self):
        """Run ``Subtract`` as a temporary system function with metrics enabled."""
        configuration = self.t_env.get_config().get_configuration()
        configuration.set_string('python.metric.enabled', 'true')

        subtract_udf = udf(Subtract(), result_type=DataTypes.BIGINT())
        self.t_env.create_temporary_system_function("subtract", subtract_udf)

        sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", sink)

        source = self.t_env.from_elements([(1, 2), (2, 5), (3, 4)], ['a', 'b'])
        projected = source.select("a, subtract(b)")
        exec_insert_table(projected, "Results")

        self.assert_equals(source_sink_utils.results(), ["1,1", "2,4", "3,3"])
Ejemplo n.º 25
0
    def test_scalar_function(self):
        """Exercise the supported ways of defining a scalar Python UDF.

        Covers a lambda, a ``SubtractOne`` instance, a ``CallablePlus``
        instance, a ``functools.partial`` and a decorated zero-argument
        function, with Python metrics disabled.
        """
        import functools

        # test metric disabled.
        configuration = self.t_env.get_config().get_configuration()
        configuration.set_string('python.metric.enabled', 'false')

        # lambda function
        plus_one_lambda = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        # Python ScalarFunction
        minus_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())
        # callable object
        plus_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())

        def partial_func(col, param):
            return col + param

        # partial function
        plus_one_partial = udf(functools.partial(partial_func, param=1),
                               result_type=DataTypes.BIGINT())

        # check memory limit is set
        @udf(result_type=DataTypes.BIGINT())
        def check_memory_limit():
            assert os.environ['_PYTHON_WORKER_MEMORY_LIMIT'] is not None
            return 1

        sink_columns = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
        sink_types = [DataTypes.BIGINT() for _ in sink_columns]
        sink = source_sink_utils.TestAppendSink(sink_columns, sink_types)
        self.t_env.register_table_sink("Results", sink)

        rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
        t = self.t_env.from_elements(rows, ['a', 'b', 'c'])
        kept = t.where(plus_one_lambda(t.b) <= 3)
        projected = kept.select(
            plus_one_lambda(t.a),
            minus_one(t.b),
            add(t.a, t.c),
            plus_one_callable(t.a),
            plus_one_partial(t.a),
            check_memory_limit(),
            t.a)
        projected.execute_insert("Results").wait()

        expected = ["+I[2, 1, 4, 2, 2, 1, 1]", "+I[4, 0, 12, 4, 4, 1, 3]"]
        self.assert_equals(source_sink_utils.results(), expected)
Ejemplo n.º 26
0
    def test_scalar_function(self):
        """Exercise the supported UDF definitions via registered function names.

        A lambda, a ``SubtractOne`` instance, the module-level ``add``, a
        ``CallablePlus`` instance and a ``functools.partial`` are registered
        and then referenced from string expressions.
        """
        import functools

        bigint = DataTypes.BIGINT

        # lambda function
        plus_one_lambda = udf(lambda i: i + 1, bigint(), bigint())
        self.t_env.register_function("add_one", plus_one_lambda)

        # Python ScalarFunction
        minus_one = udf(SubtractOne(), bigint(), bigint())
        self.t_env.register_function("subtract_one", minus_one)

        # Python function
        self.t_env.register_function("add", add)

        # callable object
        plus_one_callable = udf(CallablePlus(), bigint(), bigint())
        self.t_env.register_function("add_one_callable", plus_one_callable)

        def partial_func(col, param):
            return col + param

        # partial function
        plus_one_partial = udf(
            functools.partial(partial_func, param=1), bigint(), bigint())
        self.t_env.register_function("add_one_partial", plus_one_partial)

        sink_columns = ['a', 'b', 'c', 'd', 'e']
        sink_types = [bigint() for _ in sink_columns]
        sink = source_sink_utils.TestAppendSink(sink_columns, sink_types)
        self.t_env.register_table_sink("Results", sink)

        rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
        source = self.t_env.from_elements(rows, ['a', 'b', 'c'])
        kept = source.where("add_one(b) <= 3")
        projected = kept.select(
            "add_one(a), subtract_one(b), add(a, c), add_one_callable(a), "
            "add_one_partial(a)")
        projected.insert_into("Results")
        self.t_env.execute("test")

        self.assert_equals(source_sink_utils.results(), ["2,1,4,2,2", "4,0,12,4,4"])
Ejemplo n.º 27
0
    def test_group_aggregate_function(self):
        """Feed UDF results into pandas aggregate functions in a grouped query.

        A general UDF and a pandas UDF pre-process columns ``b`` and ``c``
        before they are aggregated per value of ``a``.
        """
        input_type = DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT())
        ])
        rows = [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)]
        t = self.t_env.from_elements(rows, input_type)

        sink_types = [
            DataTypes.TINYINT(),
            DataTypes.FLOAT(),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.INT()),
                DataTypes.FIELD("b", DataTypes.INT())
            ])
        ]
        sink = source_sink_utils.TestAppendSink(['a', 'b', 'c'], sink_types)
        self.t_env.register_table_sink("Results", sink)

        # general udf
        increment = udf(lambda a: a + 1, result_type=DataTypes.INT())
        # pandas udf
        decrement = udf(lambda a: a - 1,
                        result_type=DataTypes.INT(),
                        func_type="pandas")
        # pandas aggregate returning the (max, min) pair of its input
        max_min_udaf = udaf(lambda a: (a.max(), a.min()),
                            result_type=DataTypes.ROW([
                                DataTypes.FIELD("a", DataTypes.INT()),
                                DataTypes.FIELD("b", DataTypes.INT())
                            ]),
                            func_type="pandas")

        grouped = t.group_by("a")
        aggregated = grouped.select(
            t.a,
            mean_udaf(increment(t.b)),
            max_min_udaf(decrement(t.c)))
        aggregated.execute_insert("Results").wait()

        expected = [
            "+I[1, 6.0, +I[5, 2]]",
            "+I[2, 3.0, +I[3, 2]]",
            "+I[3, 3.0, +I[2, 2]]",
        ]
        self.assert_equals(source_sink_utils.results(), expected)
Ejemplo n.º 28
0
    def test_execute_from_json_plan(self):
        """Compile an INSERT statement to a JSON plan and execute that plan.

        A CSV source file is written to the temp directory, a Python UDF
        ``add_one`` is applied to column ``b``, and the CSV files produced in
        the sink directory are read back and compared with the expected rows.
        """
        # create source file path
        tmp_dir = self.tempdir
        data = ['1,1', '3,3', '2,2']
        source_path = tmp_dir + '/test_execute_from_json_plan_input.csv'
        sink_path = tmp_dir + '/test_execute_from_json_plan_out'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        source_table = """
            CREATE TABLE source_table (
                a BIGINT,
                b BIGINT
            ) WITH (
                'connector' = 'filesystem',
                'path' = '%s',
                'format' = 'csv'
            )
        """ % source_path
        self.t_env.execute_sql(source_table)

        self.t_env.execute_sql("""
            CREATE TABLE sink_table (
                id BIGINT,
                data BIGINT
            ) WITH (
                'connector' = 'filesystem',
                'path' = '%s',
                'format' = 'csv'
            )
        """ % sink_path)

        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        self.t_env.create_temporary_system_function("add_one", add_one)

        json_plan = self.t_env._j_tenv.getJsonPlan(
            "INSERT INTO sink_table SELECT "
            "a, "
            "add_one(b) "
            "FROM source_table")
        # ``await`` is a reserved word in Python, so the Java method is looked
        # up by name instead of being accessed as an attribute.
        from py4j.java_gateway import get_method
        get_method(self.t_env._j_tenv.executeJsonPlan(json_plan), "await")()

        import glob
        lines = []
        for result_path in glob.glob(sink_path + '/*'):
            # Close every result file deterministically instead of leaking
            # the handle until garbage collection.
            with open(result_path, 'r') as result_file:
                lines.extend(line.strip() for line in result_file)
        # The sink's file and row order is not relied upon; sort before comparing.
        lines.sort()
        self.assertEqual(lines, ['1,2', '2,3', '3,4'])
    def test_add_python_file(self):
        """Check that files added via ``add_python_file`` are importable in UDFs.

        Two helper modules are written to a fresh directory. The first is
        imported by a DataStream ``map`` function, the second by a Table API
        UDF applied to the same stream. Starting from ``[1, 2, 3, 4, 5]`` the
        pipeline adds 2 and then 3, so the sink must receive ``6`` .. ``10``.
        """
        import uuid
        env = self.env
        # A uniquely named directory holds the generated dependency modules.
        python_file_dir = os.path.join(self.tempdir,
                                       "python_file_dir_" + str(uuid.uuid4()))
        os.mkdir(python_file_dir)
        python_file_path = os.path.join(python_file_dir, "test_dep1.py")
        with open(python_file_path, 'w') as f:
            f.write("def add_two(a):\n    return a + 2")

        def plus_two_map(value):
            # Imported inside the function: the module is only supplied
            # through add_python_file below.
            from test_dep1 import add_two
            return add_two(value)

        get_j_env_configuration(env._j_stream_execution_environment).\
            setString("taskmanager.numberOfTaskSlots", "10")
        env.add_python_file(python_file_path)
        ds = env.from_collection([1, 2, 3, 4, 5])
        # The two map operators are placed in different slot sharing groups.
        ds = ds.map(plus_two_map, Types.LONG()) \
               .slot_sharing_group("data_stream") \
               .map(lambda i: i, Types.LONG()) \
               .slot_sharing_group("table")

        # Second dependency module, used by the Table API UDF.
        python_file_path = os.path.join(python_file_dir, "test_dep2.py")
        with open(python_file_path, 'w') as f:
            f.write("def add_three(a):\n    return a + 3")

        def plus_three(value):
            from test_dep2 import add_three
            return add_three(value)

        t_env = StreamTableEnvironment.create(
            stream_execution_environment=env,
            environment_settings=EnvironmentSettings.in_streaming_mode())
        env.add_python_file(python_file_path)

        from pyflink.table.udf import udf
        from pyflink.table.expressions import col
        add_three = udf(plus_three, result_type=DataTypes.BIGINT())

        # Convert the stream to a table, apply the UDF, and convert back so
        # the results can be collected by the test sink.
        tab = t_env.from_data_stream(ds, col('a')) \
                   .select(add_three(col('a')))
        t_env.to_append_stream(tab, Types.ROW([Types.LONG()])) \
             .map(lambda i: i[0]) \
             .add_sink(self.test_sink)
        env.execute("test add_python_file")
        result = self.test_sink.get_results(True)
        expected = ['6', '7', '8', '9', '10']
        # Both lists are sorted (as strings) so arrival order does not matter.
        result.sort()
        expected.sort()
        self.assertEqual(expected, result)
Ejemplo n.º 30
0
    def test_set_requirements_with_cached_directory(self):
        """Check ``set_python_requirements`` with a local requirements directory.

        A requirements file naming ``python-package1==0.0.0`` and a directory
        containing that package's archive are written to the temp directory
        and passed to ``set_python_requirements``. A UDF that imports ``plus``
        from ``python_package1`` is then run over three rows.
        """
        tmp_dir = self.tempdir
        # Unique names keep this test's files apart from others in tmp_dir.
        requirements_txt_path = os.path.join(
            tmp_dir, "requirements_txt_" + str(uuid.uuid4()))
        with open(requirements_txt_path, 'w') as f:
            f.write("python-package1==0.0.0")

        requirements_dir_path = os.path.join(
            tmp_dir, "requirements_dir_" + str(uuid.uuid4()))
        os.mkdir(requirements_dir_path)
        package_file_name = "python-package1-0.0.0.tar.gz"
        with open(os.path.join(requirements_dir_path, package_file_name),
                  'wb') as f:
            import base64
            # This base64 data is encoded from a python package file which includes a
            # "python_package1" module. The module contains a "plus(a, b)" function.
            # The base64 can be recomputed by following code:
            # base64.b64encode(open("python-package1-0.0.0.tar.gz", "rb").read()).decode("utf-8")
            f.write(
                base64.b64decode(
                    "H4sICNefrV0C/2Rpc3QvcHl0aG9uLXBhY2thZ2UxLTAuMC4wLnRhcgDtmVtv2jAYhnPtX2H1CrRCY+ckI"
                    "XEx7axuUA11u5imyICTRc1JiVnHfv1MKKWjYxwKEdPehws7xkmUfH5f+3PyqfqWpa1cjG5EKFnLbOvfhX"
                    "FQTI3nOPPSdavS5Pa8nGMwy3Esi3ke9wyTObbnGNQxamBSKlFQavzUryG8ldG6frpbEGx4yNmDLMp/hPy"
                    "P8b+6fNN613vdP1z8XdteG3+ug/17/F3Hcw1qIv5H54NUYiyUaH2SRRllaYeytkl6IpEdujI2yH2XapCQ"
                    "wSRJRDHt0OveZa//uUfeZonUvUO5bHo+0ZcoVo9bMhFRvGx9H41kWj447aUsR0WUq+pui8arWKggK5Jli"
                    "wGOo/95q79ovXi6/nfyf246Dof/n078fT9KI+X77Xx6BP83bX4Xf5NxT7dz7toO/L8OxjKgeTwpG+KcDp"
                    "sdQjWFVJMipYI+o0MCk4X/t2UYtqI0yPabCHb3f861XcD/Ty/+Y5nLdCzT0dSPo/SmbKsf6un+b7KV+Ls"
                    "W4/D/OoC9w/930P9eGwM75//csrD+Q/6P/P/k9D/oX3988Wqw1bS/tf6tR+s/m3EG/ddBqXO9XKf15C8p"
                    "P9k4HZBtBgzZaVW5vrfKcj+W32W82ygEB9D/Xu9+4/qfP9L/rBv0X1v87yONKRX61/qfzwqjIDzIPTbv/"
                    "7or3/88i0H/tfBFW7s/s/avRInQH06ieEy7tDrQeYHUdRN7wP+n/vf62LOH/pld7f9xz7a5Pfufedy0oP"
                    "86iJI8KxStAq6yLC4JWdbbVbWRikR2z1ZGytk5vauW3QdnBFE6XqwmykazCesAAAAAAAAAAAAAAAAAAAA"
                    "AAAAAAAAAAAAAAOBw/AJw5CHBAFAAAA=="))
        self.st_env.set_python_requirements(requirements_txt_path,
                                            requirements_dir_path)

        def add_one(i):
            # Imported inside the UDF: the package is only supplied through
            # the requirements configured above.
            from python_package1 import plus
            return plus(i, 1)

        self.st_env.create_temporary_system_function(
            "add_one", udf(add_one, DataTypes.BIGINT(), DataTypes.BIGINT()))
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.st_env.register_table_sink("Results", table_sink)
        t = self.st_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        t.select(expr.call('add_one', t.a),
                 t.a).execute_insert("Results").wait()

        # Each row is (add_one(a), a).
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[2, 1]", "+I[3, 2]", "+I[4, 3]"])