Ejemplo n.º 1
0
    def test_table_function(self):
        """Chain an inner and a left outer lateral join over registered functions.

        Registers the ``multi_emit`` and ``condition_multi_emit`` table
        functions and the ``multi_num`` scalar function by name, applies them
        to a three-row table through string expressions, inserts the result
        into the ``Results`` sink and compares the collected rows with the
        expected output.
        """
        results_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.BIGINT() for _ in range(3)])
        self.t_env.register_table_sink("Results", results_sink)

        multi_emit = udtf(MultiEmit(),
                          [DataTypes.BIGINT(), DataTypes.BIGINT()],
                          [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_function("multi_emit", multi_emit)

        self.t_env.register_function("condition_multi_emit",
                                     condition_multi_emit)

        multi_num = udf(MultiNum(), [DataTypes.BIGINT()], DataTypes.BIGINT())
        self.t_env.register_function("multi_num", multi_num)

        rows = [(1, 1, 3), (2, 1, 6), (3, 2, 9)]
        source = self.t_env.from_elements(rows, ['a', 'b', 'c'])

        # Inner lateral join first, then a left outer lateral join on its output.
        inner_joined = source.join_lateral(
            "multi_emit(a, multi_num(b)) as (x, y)")
        outer_joined = inner_joined.left_outer_join_lateral(
            "condition_multi_emit(x, y) as m")
        outer_joined.select("x, y, m").insert_into("Results")
        self.t_env.execute("test")

        actual = source_sink_utils.results()
        expected = [
            "1,0,null", "1,1,null", "2,0,null", "2,1,null",
            "3,0,0", "3,0,1", "3,0,2",
            "3,1,1", "3,1,2", "3,2,2", "3,3,null",
        ]
        self.assert_equals(actual, expected)
Ejemplo n.º 2
0
    def test_table_function(self):
        """Chain lateral joins built from Python call expressions on columns.

        Wraps ``MultiEmit`` as a table function and ``MultiNum`` as a scalar
        function, applies them (together with ``condition_multi_emit`` and
        ``identity``) directly to table columns, and compares the collected
        output with the expected changelog rows.
        """
        self._register_table_sink(
            ['a', 'b', 'c'],
            [DataTypes.BIGINT() for _ in range(3)])

        emit_pairs = udtf(
            MultiEmit(),
            result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
        to_num = udf(MultiNum(), result_type=DataTypes.BIGINT())

        rows = [(1, 1, 3), (2, 1, 6), (3, 2, 9)]
        source = self.t_env.from_elements(rows, ['a', 'b', 'c'])

        # Step 1: inner lateral join producing columns x and y.
        first_arg = (source.a + source.a) / 2
        emitted = emit_pairs(first_arg, to_num(source.b)).alias('x', 'y')
        joined = source.join_lateral(emitted)

        # Step 2: left outer lateral join producing column m.
        conditional = condition_multi_emit(joined.x, joined.y).alias('m')
        with_m = joined.left_outer_join_lateral(conditional).select("x, y, m")

        # Step 3: left outer lateral join feeding m through identity as n.
        passthrough = identity(with_m.m).alias('n')
        with_n = with_m.left_outer_join_lateral(passthrough).select("x, y, n")

        actual = self._get_output(with_n)
        expected = [
            "+I[1, 0, null]", "+I[1, 1, null]",
            "+I[2, 0, null]", "+I[2, 1, null]",
            "+I[3, 0, 0]", "+I[3, 0, 1]", "+I[3, 0, 2]",
            "+I[3, 1, 1]", "+I[3, 1, 2]", "+I[3, 2, 2]",
            "+I[3, 3, null]",
        ]
        self.assert_equals(actual, expected)
Ejemplo n.º 3
0
    def test_table_function(self):
        """Chain an inner and a left outer lateral join over registered functions.

        Registers ``multi_emit``, ``condition_multi_emit`` and ``multi_num``
        by name, applies them to a three-row table through string
        expressions and compares the collected output with the expected rows.
        """
        self._register_table_sink(
            ['a', 'b', 'c'],
            [DataTypes.BIGINT() for _ in range(3)])

        multi_emit = udtf(
            MultiEmit(),
            result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_function("multi_emit", multi_emit)

        self.t_env.register_function("condition_multi_emit",
                                     condition_multi_emit)

        multi_num = udf(MultiNum(), result_type=DataTypes.BIGINT())
        self.t_env.register_function("multi_num", multi_num)

        rows = [(1, 1, 3), (2, 1, 6), (3, 2, 9)]
        source = self.t_env.from_elements(rows, ['a', 'b', 'c'])

        # Inner lateral join first, then a left outer lateral join on its output.
        inner_joined = source.join_lateral(
            "multi_emit(a, multi_num(b)) as (x, y)")
        outer_joined = inner_joined.left_outer_join_lateral(
            "condition_multi_emit(x, y) as m")
        projected = outer_joined.select("x, y, m")

        actual = self._get_output(projected)
        expected = [
            "1,0,null", "1,1,null", "2,0,null", "2,1,null",
            "3,0,0", "3,0,1", "3,0,2",
            "3,1,1", "3,1,2", "3,2,2", "3,3,null",
        ]
        self.assert_equals(actual, expected)
Ejemplo n.º 4
0
    def test_execute_from_json_plan(self):
        """Compile an INSERT using a Python table function into a plan and run it.

        Writes a small CSV input under ``self.tempdir``, declares filesystem
        source and sink tables over it, registers ``multi_emit`` as a
        temporary system function, compiles the INSERT statement through the
        underlying Java table environment, executes the resulting plan and
        checks the lines written to the sink directory.
        """
        # Input file and output directory both live in the test's temp dir.
        tmp_dir = self.tempdir
        data = ['1,1', '3,2', '2,1']
        source_path = tmp_dir + '/test_execute_from_json_plan_input.csv'
        sink_path = tmp_dir + '/test_execute_from_json_plan_out'
        with open(source_path, 'w') as fd:
            fd.writelines(ele + '\n' for ele in data)

        source_table = """
            CREATE TABLE source_table (
                a BIGINT,
                b BIGINT
            ) WITH (
                'connector' = 'filesystem',
                'path' = '%s',
                'format' = 'csv'
            )
        """ % source_path
        self.t_env.execute_sql(source_table)

        self.t_env.execute_sql("""
            CREATE TABLE sink_table (
                a BIGINT,
                b BIGINT,
                c BIGINT
            ) WITH (
                'connector' = 'filesystem',
                'path' = '%s',
                'format' = 'csv'
            )
        """ % sink_path)

        self.t_env.create_temporary_system_function(
            "multi_emit",
            udtf(MultiEmit(),
                 result_types=[DataTypes.BIGINT(),
                               DataTypes.BIGINT()]))

        json_plan = self.t_env._j_tenv.compilePlanSql(
            "INSERT INTO sink_table "
            "SELECT a, x, y FROM source_table "
            "LEFT JOIN LATERAL TABLE(multi_emit(a, b))"
            " as T(x, y)"
            " ON TRUE")
        # ``await`` is a reserved word in Python, so the Java method of that
        # name has to be looked up through py4j's get_method.
        from py4j.java_gateway import get_method
        get_method(self.t_env._j_tenv.executePlan(json_plan), "await")()

        import glob
        lines = []
        for sink_file_path in glob.glob(sink_path + '/*'):
            # Read each output file under ``with`` so its handle is closed
            # promptly instead of being left to the garbage collector.
            with open(sink_file_path, 'r') as sink_file:
                lines.extend(line.strip() for line in sink_file)
        # Sort because the order of files and rows in the sink is not fixed
        # by this test.
        lines.sort()
        self.assertEqual(lines, ['1,1,0', '2,2,0', '3,3,0', '3,3,1'])
Ejemplo n.º 5
0
    def test_row_type_as_input_types_and_result_types(self):
        """Check that a bare ROW type given to ``udtf`` is stored as a one-item list.

        A single ``DataTypes.ROW`` is passed for both ``input_types`` and
        ``result_types``; the wrapper's ``_input_types`` and ``_result_types``
        are then expected to equal a list holding that row type.
        """
        def single_bigint_row():
            # Fresh ROW(a BIGINT) instance on every call.
            return DataTypes.ROW([DataTypes.FIELD("a", DataTypes.BIGINT())])

        wrapped = udtf(lambda i: i,
                       input_types=single_bigint_row(),
                       result_types=single_bigint_row())

        self.assertEqual(wrapped._input_types, [single_bigint_row()])
        self.assertEqual(wrapped._result_types, [single_bigint_row()])
Ejemplo n.º 6
0
 def execute(self, function_context: FlinkFunctionContext,
             input_list: List[Table]) -> List[Table]:
     """Register the ``search`` table function and apply it to the first input.

     :param function_context: context providing the table environment.
     :param input_list: input tables; only the first one is used.
     :return: a one-element list holding the table with the ``face_id`` and
         ``near_id`` columns selected after the lateral join.
     """
     table_env = function_context.get_table_env()
     source = input_list[0]

     search_function = udtf(
         SearchUDTF1(self.path, self.element_type),
         DataTypes.STRING(),
         DataTypes.STRING())
     table_env.register_function("search", search_function)

     joined = source.join_lateral("search(feature_data) as near_id")
     return [joined.select("face_id, near_id")]
    def transform(self, *inputs: Table) -> List[Table]:
        """Append prediction columns to the first input table.

        The first input is flat-mapped through a ``Predict`` table function
        whose result types are the input's field types followed by
        ``self.predict_data_types``; the output columns are then named with
        the input's field names followed by ``self.predict_col_names``.

        :param inputs: input tables; only the first one is used.
        :return: a one-element list holding the resulting table.
        """
        source = inputs[0]
        row_type = source.get_schema().to_row_data_type()
        input_types = row_type.field_types()
        input_names = row_type.field_names()

        model_dir = os.path.join(self.path, "model_data")
        predictor = Predict(model_dir, self.predict_col_names)
        output_types = input_types + self.predict_data_types

        predicted = source.flat_map(udtf(f=predictor, result_types=output_types))
        return [predicted.alias(*input_names, *self.predict_col_names)]
Ejemplo n.º 8
0
    def test_table_function_with_sql_query(self):
        """Call a Python table function from SQL via LEFT JOIN LATERAL TABLE.

        Registers ``multi_emit`` as a temporary system function, exposes a
        three-row table as ``MyTable``, runs the lateral-join query and
        compares the collected output with the expected rows.
        """
        self._register_table_sink(
            ['a', 'b', 'c'],
            [DataTypes.BIGINT() for _ in range(3)])

        multi_emit = udtf(
            MultiEmit(),
            result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.create_temporary_system_function("multi_emit", multi_emit)

        rows = [(1, 1, 3), (2, 1, 6), (3, 2, 9)]
        source = self.t_env.from_elements(rows, ['a', 'b', 'c'])
        self.t_env.register_table("MyTable", source)

        query = (
            "SELECT a, x, y FROM MyTable LEFT JOIN LATERAL TABLE(multi_emit(a, b)) as T(x, y)"
            " ON TRUE"
        )
        result = self.t_env.sql_query(query)

        actual = self._get_output(result)
        expected = ["1,1,0", "2,2,0", "3,3,0", "3,3,1"]
        self.assert_equals(actual, expected)
Ejemplo n.º 9
0
    def test_table_function_with_sql_query(self):
        """Call a Python table function from SQL via LEFT JOIN LATERAL TABLE.

        Registers the ``Results`` sink and the ``multi_emit`` table function,
        exposes a three-row table as ``MyTable``, inserts the query result
        into the sink, executes the job and compares the collected rows with
        the expected output.
        """
        results_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.BIGINT() for _ in range(3)])
        self.t_env.register_table_sink("Results", results_sink)

        multi_emit = udtf(MultiEmit(),
                          [DataTypes.BIGINT(), DataTypes.BIGINT()],
                          [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_function("multi_emit", multi_emit)

        rows = [(1, 1, 3), (2, 1, 6), (3, 2, 9)]
        source = self.t_env.from_elements(rows, ['a', 'b', 'c'])
        self.t_env.register_table("MyTable", source)

        query = (
            "SELECT a, x, y FROM MyTable LEFT JOIN LATERAL TABLE(multi_emit(a, b)) as T(x, y)"
            " ON TRUE"
        )
        self.t_env.sql_query(query).insert_into("Results")
        self.t_env.execute("test")

        actual = source_sink_utils.results()
        expected = ["1,1,0", "2,2,0", "3,3,0", "3,3,1"]
        self.assert_equals(actual, expected)
Ejemplo n.º 10
0
          word STRING,
          length INT
        ) WITH (
          'connector' = 'filesystem',
          'format' = 'csv',
          'path' = './output/udtf.output2'
        )
    """)

my_table = table_env.from_path("mySource")

# Configure the task off-heap memory of the task manager so that the Python
# worker can use off-heap memory.
configuration = table_env.get_config().get_configuration()
configuration.set_string("taskmanager.memory.task.off-heap.size", '80m')

# Register the Python table function under the name "split".
split_function = udtf(
    Split(),
    DataTypes.STRING(),
    [DataTypes.STRING(), DataTypes.INT()])
table_env.register_function("split", split_function)

# Use the Python table function through the Python Table API: run the lateral
# join into "mySink1" and fetch the job execution result.
# my_table.join_lateral("split(a) as (word, length)").insert_into("mySink1")
split_rows = my_table.join_lateral("split(a) as (word, length)")
insert_result = split_rows.execute_insert("mySink1")
insert_result.get_job_client().get_job_execution_result().result()

# Left outer join variant, kept disabled:
# my_table.left_outer_join_lateral("split(a) as (word, length)").execute_insert(
#     "mySink2").get_job_client().get_job_execution_result().result()
# my_table.left_outer_join_lateral("split(a) as (word, length)")

# table_env.execute("udtf job")