Example 1
    def test_check_result_type(self):
        def pandas_udaf():
            pass

        with self.assertRaises(
                TypeError,
                msg="Invalid returnType: Pandas UDAF doesn't support DataType type MAP currently"):
            udaf(pandas_udaf, result_type=DataTypes.MAP(DataTypes.INT(), DataTypes.INT()),
                 func_type="pandas")
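For contrast, a minimal sketch of a declaration that passes this check; Pandas UDAFs accept scalar result types such as INT (the names below are illustrative, not from the test suite):

from pyflink.table import DataTypes
from pyflink.table.udf import udaf

def max_agg(v):
    # v arrives as a pandas.Series under func_type="pandas"
    return v.max()

max_udaf = udaf(max_agg, result_type=DataTypes.INT(), func_type="pandas")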
Example 2
 def test_map_view(self):
     my_count = udaf(CountDistinctAggregateFunction())
     self.t_env.get_config().set_idle_state_retention(
         datetime.timedelta(days=1))
     self.t_env.get_config().set("python.fn-execution.bundle.size", "2")
     # trigger the cache eviction in a bundle.
     self.t_env.get_config().set("python.state.cache-size", "1")
     self.t_env.get_config().set("python.map-state.read-cache-size", "1")
     self.t_env.get_config().set("python.map-state.write-cache-size", "1")
     t = self.t_env.from_elements([(1, 'Hi_', 'hi'), (1, 'Hi', 'hi'),
                                   (2, 'hello', 'hello'), (3, 'Hi_', 'hi'),
                                   (3, 'Hi', 'hi'), (4, 'hello', 'hello'),
                                   (5, 'Hi2_', 'hi'), (5, 'Hi2', 'hi'),
                                   (6, 'hello2', 'hello'), (7, 'Hi', 'hi'),
                                   (8, 'hello', 'hello'), (9, 'Hi2', 'hi'),
                                   (13, 'Hi3', 'hi')], ['a', 'b', 'c'])
     self.t_env.create_temporary_view("source", t)
     table_with_retract_message = self.t_env.sql_query(
         "select LAST_VALUE(b) as b, LAST_VALUE(c) as c from source group by a"
     )
     result = table_with_retract_message.group_by(t.c).select(
         my_count(t.b).alias("a"), t.c)
     assert_frame_equal(
         result.to_pandas().sort_values('c').reset_index(drop=True),
         pd.DataFrame([[2, "hello"], [3, "hi"]], columns=['a', 'c']))
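CountDistinctAggregateFunction is used here and in later examples but never defined in these snippets. The sketch below is a plausible reconstruction, assuming a MapView-backed accumulator (which is what makes the python.map-state.* cache options above relevant); the exact field layout is an assumption:

from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes, MapView


class CountDistinctAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # Row(per-value counts kept in map state, number of distinct values)
        return Row(MapView(), 0)

    def accumulate(self, accumulator, value):
        if accumulator[0].contains(value):
            accumulator[0].put(value, accumulator[0].get(value) + 1)
        else:
            accumulator[0].put(value, 1)
            accumulator[1] += 1

    def retract(self, accumulator, value):
        # needed because the LAST_VALUE query above produces retractions
        if not accumulator[0].contains(value):
            return
        remaining = accumulator[0].get(value) - 1
        if remaining == 0:
            accumulator[0].remove(value)
            accumulator[1] -= 1
        else:
            accumulator[0].put(value, remaining)

    def get_value(self, accumulator):
        return accumulator[1]

    def get_accumulator_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("f0", DataTypes.MAP_VIEW(DataTypes.STRING(), DataTypes.BIGINT())),
            DataTypes.FIELD("f1", DataTypes.BIGINT())])

    def get_result_type(self):
        return DataTypes.BIGINT()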
Example 3
 def test_map_view_iterate(self):
     test_iterate = udaf(TestIterateAggregateFunction())
     self.t_env.get_config().set_idle_state_retention(
         datetime.timedelta(days=1))
     self.t_env.get_config().set("python.fn-execution.bundle.size", "2")
     # trigger the cache eviction in a bundle.
     self.t_env.get_config().set("python.state.cache-size", "2")
     self.t_env.get_config().set("python.map-state.read-cache-size", "2")
     self.t_env.get_config().set("python.map-state.write-cache-size", "2")
     self.t_env.get_config().set(
         "python.map-state.iterate-response-batch-size", "2")
     t = self.t_env.from_elements([(1, 'Hi_', 'hi'), (1, 'Hi', 'hi'),
                                   (2, 'hello', 'hello'), (3, 'Hi_', 'hi'),
                                   (3, 'Hi', 'hi'), (4, 'hello', 'hello'),
                                   (5, 'Hi2_', 'hi'), (5, 'Hi2', 'hi'),
                                   (6, 'hello2', 'hello'), (7, 'Hi', 'hi'),
                                   (8, 'hello', 'hello'), (9, 'Hi2', 'hi'),
                                   (13, 'Hi3', 'hi')], ['a', 'b', 'c'])
     self.t_env.create_temporary_view("source", t)
     table_with_retract_message = self.t_env.sql_query(
         "select LAST_VALUE(b) as b, LAST_VALUE(c) as c from source group by a"
     )
     result = table_with_retract_message.group_by(t.c) \
         .select(test_iterate(t.b).alias("a"), t.c) \
         .select(col("a").get(0).alias("a"),
                 col("a").get(1).alias("b"),
                 col("a").get(2).alias("c"),
                 col("a").get(3).alias("d"),
                 t.c.alias("e"))
     assert_frame_equal(
         result.to_pandas().sort_values('c').reset_index(drop=True),
         pd.DataFrame(
             [["Hi,Hi2,Hi3", "1,2,3", "Hi:3,Hi2:2,Hi3:1", 3, "hi"],
              ["hello,hello2", "1,3", 'hello:3,hello2:1', 2, "hello"]],
             columns=['a', 'b', 'c', 'd', 'e']))
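TestIterateAggregateFunction is likewise not shown. A sketch consistent with the expected output (sorted distinct values, sorted counts, value:count pairs, and the distinct-value count), again assuming a MapView-backed accumulator and that MapView.items() yields (key, value) tuples:

from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes, MapView


class TestIterateAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        return Row(MapView(), 0)

    def accumulate(self, accumulator, value):
        if accumulator[0].contains(value):
            accumulator[0].put(value, accumulator[0].get(value) + 1)
        else:
            accumulator[0].put(value, 1)
            accumulator[1] += 1

    def retract(self, accumulator, value):
        if not accumulator[0].contains(value):
            return
        remaining = accumulator[0].get(value) - 1
        if remaining == 0:
            accumulator[0].remove(value)
            accumulator[1] -= 1
        else:
            accumulator[0].put(value, remaining)

    def get_value(self, accumulator):
        # exercise the three MapView iterators: keys(), values() and items()
        keys = sorted(accumulator[0].keys())
        counts = sorted(str(v) for v in accumulator[0].values())
        pairs = ["%s:%s" % (k, v) for k, v in sorted(accumulator[0].items())]
        return Row(",".join(keys), ",".join(counts), ",".join(pairs),
                   accumulator[1])

    def get_accumulator_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("f0", DataTypes.MAP_VIEW(DataTypes.STRING(), DataTypes.BIGINT())),
            DataTypes.FIELD("f1", DataTypes.BIGINT())])

    def get_result_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.STRING()),
            DataTypes.FIELD("b", DataTypes.STRING()),
            DataTypes.FIELD("c", DataTypes.STRING()),
            DataTypes.FIELD("d", DataTypes.BIGINT())])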
Example 4
    def test_aggregate_with_pandas_udaf(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD("b", DataTypes.SMALLINT()),
                DataTypes.FIELD("c", DataTypes.INT())
            ]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.TINYINT(),
             DataTypes.FLOAT(),
             DataTypes.INT()])
        self.t_env.register_table_sink("Results", table_sink)
        pandas_udaf = udaf(lambda a: (a.mean(), a.max()),
                           result_type=DataTypes.ROW([
                               DataTypes.FIELD("a", DataTypes.FLOAT()),
                               DataTypes.FIELD("b", DataTypes.INT())
                           ]),
                           func_type="pandas")
        t.group_by(t.a) \
            .aggregate(pandas_udaf(t.b).alias("c", "d")) \
            .select("a, c, d").execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1,5.0,8", "2,2.0,3"])
Example 5
    def test_aggregate_with_pandas_udaf_without_keys(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD("b", DataTypes.SMALLINT()),
                DataTypes.FIELD("c", DataTypes.INT())
            ]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.FLOAT(), DataTypes.INT()])
        self.t_env.register_table_sink("Results", table_sink)
        pandas_udaf = udaf(lambda pd: Row(pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW([
                               DataTypes.FIELD("a", DataTypes.FLOAT()),
                               DataTypes.FIELD("b", DataTypes.INT())
                           ]),
                           func_type="pandas")
        t.select(t.b) \
            .aggregate(pandas_udaf.alias("a", "b")) \
            .select("a, b") \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[3.8, 8]"])
Example 6
    def test_proc_time_over_rows_window_aggregate_function(self):
        # create source file path
        import tempfile
        import os
        tmp_dir = tempfile.gettempdir()
        data = [
            '1,1,2013-01-01 03:10:00', '3,2,2013-01-01 03:10:00',
            '2,1,2013-01-01 03:10:00', '1,5,2013-01-01 03:10:00',
            '1,8,2013-01-01 04:20:00', '2,3,2013-01-01 03:30:00'
        ]
        source_path = tmp_dir + '/test_over_rows_window_aggregate_function.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        max_add_min_udaf = udaf(lambda a: a.max() + a.min(),
                                result_type=DataTypes.SMALLINT(),
                                func_type='pandas')
        self.env.set_parallelism(1)
        self.env.set_stream_time_characteristic(
            TimeCharacteristic.ProcessingTime)
        self.t_env.register_function("mean_udaf", mean_udaf)
        self.t_env.register_function("max_add_min_udaf", max_add_min_udaf)
        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                proctime as PROCTIME()
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.TINYINT(),
             DataTypes.FLOAT(),
             DataTypes.SMALLINT()])
        self.t_env.register_table_sink("Results", table_sink)
        self.t_env.execute_sql("""
            insert into Results
            select a,
             mean_udaf(b)
             over (PARTITION BY a ORDER BY proctime
             ROWS BETWEEN 1 PRECEDING AND CURRENT ROW),
             max_add_min_udaf(b)
             over (PARTITION BY a ORDER BY proctime
             ROWS BETWEEN 1 PRECEDING AND CURRENT ROW)
            from source_table
        """).wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "+I[1, 1.0, 2]", "+I[1, 3.0, 6]", "+I[1, 6.5, 13]",
            "+I[2, 1.0, 2]", "+I[2, 2.0, 4]", "+I[3, 2.0, 4]"
        ])
        os.remove(source_path)
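mean_udaf is used throughout these tests but never defined in the snippets; a plausible minimal definition, matching the FLOAT result columns declared in the sinks:

from pyflink.table import DataTypes
from pyflink.table.udf import udaf


@udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
def mean_udaf(v):
    # v arrives as a pandas.Series
    return v.mean()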
Example 7
    def test_group_aggregate_with_aux_group(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b INT, c FLOAT, d INT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        self.t_env.get_config().set('python.metric.enabled', 'true')
        self.t_env.register_function("max_add", udaf(MaxAdd(),
                                                     result_type=DataTypes.INT(),
                                                     func_type="pandas"))
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        t.group_by(t.a) \
            .select(t.a,  (t.a + 1).alias("b"), (t.a + 2).alias("c")) \
            .group_by(t.a, t.b) \
            .select(t.a, t.b, mean_udaf(t.b), call("max_add", t.b, t.c, 1)) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[1, 2, 2.0, 6]", "+I[2, 3, 3.0, 8]", "+I[3, 4, 4.0, 10]"])
Example 8
 def test_row_time_over_range_window_aggregate_function(self):
     # create source file path
     import tempfile
     import os
     tmp_dir = tempfile.gettempdir()
     data = [
         '1,1,2013-01-01 03:10:00',
         '3,2,2013-01-01 03:10:00',
         '2,1,2013-01-01 03:10:00',
         '1,5,2013-01-01 03:10:00',
         '1,8,2013-01-01 04:20:00',
         '2,3,2013-01-01 03:30:00'
     ]
     source_path = tmp_dir + '/test_over_range_window_aggregate_function.csv'
     with open(source_path, 'w') as fd:
         for ele in data:
             fd.write(ele + '\n')
     max_add_min_udaf = udaf(lambda a: a.max() + a.min(),
                             result_type=DataTypes.SMALLINT(),
                             func_type='pandas')
     self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
     self.t_env.register_function("mean_udaf", mean_udaf)
     self.t_env.register_function("max_add_min_udaf", max_add_min_udaf)
     source_table = """
         create table source_table(
             a TINYINT,
             b SMALLINT,
             rowtime TIMESTAMP(3),
             WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
         ) with(
             'connector.type' = 'filesystem',
             'format.type' = 'csv',
             'connector.path' = '%s',
             'format.ignore-first-line' = 'false',
             'format.field-delimiter' = ','
         )
     """ % source_path
     self.t_env.execute_sql(source_table)
     table_sink = source_sink_utils.TestAppendSink(
         ['a', 'b', 'c'],
         [
             DataTypes.TINYINT(),
             DataTypes.FLOAT(),
             DataTypes.SMALLINT()])
     self.t_env.register_table_sink("Results", table_sink)
     self.t_env.execute_sql("""
         insert into Results
         select a,
          mean_udaf(b)
          over (PARTITION BY a ORDER BY rowtime
          RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND CURRENT ROW),
          max_add_min_udaf(b)
          over (PARTITION BY a ORDER BY rowtime
          RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND CURRENT ROW)
         from source_table
     """).wait()
     actual = source_sink_utils.results()
     self.assert_equals(actual,
                        ["1,3.0,6", "1,3.0,6", "1,8.0,16", "2,1.0,2", "2,2.0,4", "3,2.0,4"])
     os.remove(source_path)
Example 9
    def test_group_aggregate_function(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b FLOAT,c ROW<a INT, b INT>) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        # general udf
        add = udf(lambda a: a + 1, result_type=DataTypes.INT())
        # pandas udf
        subtract = udf(lambda a: a - 1, result_type=DataTypes.INT(), func_type="pandas")
        max_udaf = udaf(lambda a: (a.max(), a.min()),
                        result_type=DataTypes.ROW(
                            [DataTypes.FIELD("a", DataTypes.INT()),
                             DataTypes.FIELD("b", DataTypes.INT())]),
                        func_type="pandas")
        t.group_by(t.a) \
            .select(t.a, mean_udaf(add(t.b)), max_udaf(subtract(t.c))) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(
            actual,
            ["+I[1, 6.0, +I[5, 2]]", "+I[2, 3.0, +I[3, 2]]", "+I[3, 3.0, +I[2, 2]]"])
Example 10
    def test_group_aggregate_function(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.TINYINT(), DataTypes.FLOAT(),
             DataTypes.ROW(
                 [DataTypes.FIELD("a", DataTypes.INT()),
                  DataTypes.FIELD("b", DataTypes.INT())])])
        self.t_env.register_table_sink("Results", table_sink)
        # general udf
        add = udf(lambda a: a + 1, result_type=DataTypes.INT())
        # pandas udf
        subtract = udf(lambda a: a - 1, result_type=DataTypes.INT(), func_type="pandas")
        max_udaf = udaf(lambda a: (a.max(), a.min()),
                        result_type=DataTypes.ROW(
                            [DataTypes.FIELD("a", DataTypes.INT()),
                             DataTypes.FIELD("b", DataTypes.INT())]),
                        func_type="pandas")
        t.group_by(t.a) \
            .select(t.a, mean_udaf(add(t.b)), max_udaf(subtract(t.c))) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(
            actual,
            ["+I[1, 6.0, +I[5, 2]]", "+I[2, 3.0, +I[3, 2]]", "+I[3, 3.0, +I[2, 2]]"])
Example 11
    def test_aggregate_with_pandas_udaf(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b FLOAT, c INT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.a.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        t.select(t.a, t.b) \
            .group_by(t.a) \
            .aggregate(pandas_udaf) \
            .select(expr.col("*")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[1, 5.0, 1]", "+I[2, 2.0, 2]"])
Example 12
    def test_group_aggregate_with_aux_group(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd'],
            [DataTypes.TINYINT(), DataTypes.INT(), DataTypes.FLOAT(), DataTypes.INT()])
        self.t_env.register_table_sink("Results", table_sink)
        self.t_env.get_config().set('python.metric.enabled', 'true')
        self.t_env.register_function("max_add", udaf(MaxAdd(),
                                                     result_type=DataTypes.INT(),
                                                     func_type="pandas"))
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        t.group_by(t.a) \
            .select(t.a,  (t.a + 1).alias("b"), (t.a + 2).alias("c")) \
            .group_by(t.a, t.b) \
            .select(t.a, t.b, mean_udaf(t.b), call("max_add", t.b, t.c, 1)) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[1, 2, 2.0, 6]", "+I[2, 3, 3.0, 8]", "+I[3, 4, 4.0, 10]"])
Example 13
    def test_slide_group_window_aggregate_function(self):
        import datetime
        from pyflink.table.window import Slide
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd', 'e'],
            [
                DataTypes.TINYINT(),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT(),
                DataTypes.INT()
            ])
        self.t_env.register_table_sink("Results", table_sink)
        self.t_env.register_function("max_add", udaf(MaxAdd(),
                                                     result_type=DataTypes.INT(),
                                                     func_type="pandas"))
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        slide_window = Slide.over(lit(1).hours) \
            .every(lit(30).minutes) \
            .on(col("rowtime")) \
            .alias("w")
        t.window(slide_window) \
            .group_by(t.a, col("w")) \
            .select(t.a,
                    col("w").start,
                    col("w").end,
                    mean_udaf(t.b),
                    call("max_add", t.b, t.c, 1)) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[1, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0, 6]",
                            "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.5, 7]",
                            "+I[1, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 5.5, 14]",
                            "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0, 14]",
                            "+I[2, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 1.0, 4]",
                            "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0, 10]",
                            "+I[2, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 3.0, 10]",
                            "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0, 7]",
                            "+I[3, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0, 7]"])
Example 14
 def test_using_decorator(self):
     my_count = udaf(CountAggregateFunction(),
                     accumulator_type=DataTypes.ARRAY(DataTypes.INT()),
                     result_type=DataTypes.INT())
     t = self.t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c'])
     result = t.group_by(t.c) \
         .select(my_count(t.a).alias("a"), t.c.alias("b"))
     plan = result.explain()
     result_type = result.get_schema().get_field_data_type(0)
     self.assertTrue(plan.find("PythonGroupAggregate(groupBy=[c], ") >= 0)
     self.assertEqual(result_type, DataTypes.INT())
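CountAggregateFunction is not shown here; a minimal sketch consistent with the ARRAY(INT) accumulator type declared above:

from pyflink.table import AggregateFunction


class CountAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # matches accumulator_type=DataTypes.ARRAY(DataTypes.INT())
        return [0]

    def accumulate(self, accumulator, *args):
        accumulator[0] += 1

    def retract(self, accumulator, *args):
        accumulator[0] -= 1

    def get_value(self, accumulator):
        return accumulator[0]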
Example 15
 def process(self) -> None:
   calculate = udaf(CalculateAgg())
   shop_agg = udaf(ShopAgg())
   table = self.table_env.from_path("wish_product_data")
   result_table = table.group_by(table.pid)\
                 .select(calculate(table.timestamp, table.pid, table.merchant_id,
                                   table.merchant_name, table.shop_name, table.review_number,
                                   table.review_score, table.shop_review_number,
                                   table.title, table.is_pb, table.is_hwc, table.is_verified,
                                   table.total_bought, table.total_wishlist, table.tags,
                                   table.category_ids, table.category_paths,
                                   table.category_l1_ids, table.category_l2_ids,
                                   table.category_l3_ids, table.leaf_category_ids,
                                   table.price, table.shipping_price, table.sold, table.update_time,
                                   table.shop_open_time, table.gen_time, table.data_update_time))
                         #calculate(table.timestamp, table.update_time, table.sold, table.price, table.review_number))
   shop_table = table.select(table.update_time, table.shop_name)
   shop_table = shop_table.add_columns("1 as count")
   shop_result_table = shop_table.group_by(shop_table.shop_name)\
       .select(shop_table.shop_name, shop_agg(shop_table.update_time, shop_table.count))
   self.sink_to_kafka(self.wish_result_topic, result_table, self.result_type)
   self.sink_to_kafka(self.wish_shop_result_topic, shop_result_table, self.shop_result_type)
Example 16
 def test_data_view_clear(self):
     my_count = udaf(CountDistinctAggregateFunction())
     self.t_env.get_config().set_idle_state_retention(
         datetime.timedelta(days=1))
     self.t_env.get_config().set("python.fn-execution.bundle.size", "2")
     # trigger the cache eviction in a bundle.
     self.t_env.get_config().set("python.state.cache-size", "1")
     t = self.t_env.from_elements([(2, 'hello', 'hello'),
                                   (4, 'clear', 'hello'),
                                   (6, 'hello2', 'hello'),
                                   (8, 'hello', 'hello')], ['a', 'b', 'c'])
     result = t.group_by(t.c).select(my_count(t.b).alias("a"), t.c)
     assert_frame_equal(result.to_pandas(),
                        pd.DataFrame([[2, "hello"]], columns=['a', 'c']))
Example 17
    def test_window_aggregate_with_pandas_udaf(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT(),
                DataTypes.INT()
            ])
        self.t_env.register_table_sink("Results", table_sink)
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        tumble_window = Tumble.over(expr.lit(1).hours) \
            .on(expr.col("rowtime")) \
            .alias("w")
        t.select(t.b, t.rowtime) \
            .window(tumble_window) \
            .group_by("w") \
            .aggregate(pandas_udaf.alias("d", "e")) \
            .select("w.rowtime, d, e") \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["2018-03-11 03:59:59.999,2.2,3",
                            "2018-03-11 04:59:59.999,8.0,8"])
Example 18
 def test_list_view(self):
     my_concat = udaf(ListViewConcatAggregateFunction())
     self.t_env.get_config().set("python.fn-execution.bundle.size", "2")
     # trigger the cache eviction in a bundle.
     self.t_env.get_config().set("python.state.cache-size", "2")
     t = self.t_env.from_elements([(1, 'Hi', 'Hello'), (3, 'Hi', 'hi'),
                                   (3, 'Hi2', 'hi'), (3, 'Hi', 'hi'),
                                   (2, 'Hi', 'Hello'), (1, 'Hi2', 'Hello'),
                                   (3, 'Hi3', 'hi'), (3, 'Hi2', 'Hello'),
                                   (3, 'Hi3', 'hi'), (2, 'Hi3', 'Hello')],
                                  ['a', 'b', 'c'])
     result = t.group_by(t.c).select(my_concat(t.b, ',').alias("a"), t.c)
     assert_frame_equal(
         result.to_pandas().sort_values('c').reset_index(drop=True),
         pd.DataFrame(
             [["Hi,Hi,Hi2,Hi2,Hi3", "Hello"], ["Hi,Hi2,Hi,Hi3,Hi3", "hi"]],
             columns=['a', 'c']))
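ListViewConcatAggregateFunction is not defined in these snippets either; a sketch assuming the separator is passed with every row and the values are kept in a ListView:

from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes, ListView


class ListViewConcatAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # Row(values kept in list state, separator)
        return Row(ListView(), '')

    def accumulate(self, accumulator, value, sep):
        accumulator[1] = sep
        accumulator[0].add(value)

    def get_value(self, accumulator):
        # the ListView is iterable
        return accumulator[1].join(accumulator[0])

    def get_accumulator_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("f0", DataTypes.LIST_VIEW(DataTypes.STRING())),
            DataTypes.FIELD("f1", DataTypes.STRING())])

    def get_result_type(self):
        return DataTypes.STRING()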
Example 19
 def test_data_view(self):
     my_concat = udaf(ConcatAggregateFunction())
     self.t_env.get_config().get_configuration().set_string(
         "python.fn-execution.bundle.size", "2")
     # trigger the cache eviction in a bundle.
     self.t_env.get_config().get_configuration().set_string(
         "python.state.cache.size", "2")
     t = self.t_env.from_elements([(1, 'Hi', 'Hello'),
                                   (3, 'Hi', 'hi'),
                                   (3, 'Hi2', 'hi'),
                                   (3, 'Hi', 'hi2'),
                                   (2, 'Hi', 'Hello')], ['a', 'b', 'c'])
     result = t.group_by(t.c).select(my_concat(t.b, ',').alias("a"), t.c)
     assert_frame_equal(result.to_pandas(),
                        pd.DataFrame([["Hi,Hi2", "hi"],
                                      ["Hi", "hi2"],
                                      ["Hi,Hi", "Hello"]], columns=['a', 'c']))
Example 20
    def test_group_aggregate_without_keys(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a'],
            [DataTypes.INT()])
        min_add = udaf(lambda a, b, c: a.min() + b.min() + c.min(),
                       result_type=DataTypes.INT(), func_type="pandas")
        self.t_env.register_table_sink("Results", table_sink)
        t.select(min_add(t.a, t.b, t.c)) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[5]"])
Example 21
    def test_window_aggregate_with_pandas_udaf(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b FLOAT, c INT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        print(t.get_schema())
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        tumble_window = Tumble.over(expr.lit(1).hours) \
            .on(expr.col("rowtime")) \
            .alias("w")
        t.select(t.b, t.rowtime) \
            .window(tumble_window) \
            .group_by(expr.col("w")) \
            .aggregate(pandas_udaf.alias("d", "e")) \
            .select(expr.col("w").rowtime, expr.col("d"), expr.col("e")) \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[2018-03-11 03:59:59.999, 2.2, 3]",
                            "+I[2018-03-11 04:59:59.999, 8.0, 8]"])
Example 22
    def test_group_aggregate_without_keys(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        sink_table_ddl = """
        CREATE TABLE Results(a INT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        min_add = udaf(lambda a, b, c: a.min() + b.min() + c.min(),
                       result_type=DataTypes.INT(), func_type="pandas")
        t.select(min_add(t.a, t.b, t.c)) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[5]"])
Example 23
    def test_aggregate(self):
        import pandas as pd
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.BIGINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        function = CountAndSumAggregateFunction()
        agg = udaf(function,
                   result_type=function.get_result_type(),
                   accumulator_type=function.get_accumulator_type(),
                   name=str(function.__class__.__name__))
        result = t.group_by(t.a) \
            .aggregate(agg.alias("c", "d")) \
            .select("a, c, d") \
            .to_pandas()
        assert_frame_equal(result, pd.DataFrame([[1, 3, 15], [2, 2, 4]], columns=['a', 'c', 'd']))
Example 24
    def test_aggregate_with_pandas_udaf_without_keys(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        sink_table_ddl = """
        CREATE TABLE Results(a FLOAT, b INT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        pandas_udaf = udaf(lambda pd: Row(pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        t.select(t.b) \
            .aggregate(pandas_udaf.alias("a", "b")) \
            .select(t.a, t.b) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[3.8, 8]"])
Example 25
    def test_execute_over_aggregate_from_json_plan(self):
        # create source file path
        tmp_dir = self.tempdir
        data = [
            '1,1,2013-01-01 03:10:00',
            '3,2,2013-01-01 03:10:00',
            '2,1,2013-01-01 03:10:00',
            '1,5,2013-01-01 03:10:00',
            '1,8,2013-01-01 04:20:00',
            '2,3,2013-01-01 03:30:00'
        ]
        source_path = tmp_dir + '/test_execute_over_aggregate_from_json_plan.csv'
        sink_path = tmp_dir + '/test_execute_over_aggregate_from_json_plan'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        source_table = """
            CREATE TABLE source_table (
                a TINYINT,
                b SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) WITH (
                'connector' = 'filesystem',
                'path' = '%s',
                'format' = 'csv'
            )
        """ % source_path
        self.t_env.execute_sql(source_table)

        self.t_env.execute_sql("""
            CREATE TABLE sink_table (
                a TINYINT,
                b FLOAT,
                c SMALLINT
            ) WITH (
                'connector' = 'filesystem',
                'path' = '%s',
                'format' = 'csv'
            )
        """ % sink_path)

        max_add_min_udaf = udaf(lambda a: a.max() + a.min(),
                                result_type=DataTypes.SMALLINT(),
                                func_type='pandas')
        self.t_env.get_config().set(
            "pipeline.time-characteristic", "EventTime")
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        self.t_env.create_temporary_system_function("max_add_min_udaf", max_add_min_udaf)

        json_plan = self.t_env._j_tenv.compilePlanSql("""
        insert into sink_table
            select a,
             mean_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN 1 PRECEDING AND CURRENT ROW),
             max_add_min_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN 1 PRECEDING AND CURRENT ROW)
            from source_table
        """)
        from py4j.java_gateway import get_method
        get_method(self.t_env._j_tenv.executePlan(json_plan), "await")()

        import glob
        lines = [line.strip() for file in glob.glob(sink_path + '/*') for line in open(file, 'r')]
        lines.sort()
        self.assertEqual(lines, ['1,1.0,2', '1,3.0,6', '1,6.5,13', '2,1.0,2', '2,2.0,4', '3,2.0,4'])
Example 26
    def test_over_window_aggregate_function(self):
        import datetime
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 1, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'],
            [DataTypes.TINYINT(), DataTypes.FLOAT(), DataTypes.INT(), DataTypes.FLOAT(),
             DataTypes.FLOAT(), DataTypes.FLOAT(), DataTypes.FLOAT(), DataTypes.FLOAT(),
             DataTypes.FLOAT(), DataTypes.FLOAT()])
        self.t_env.register_table_sink("Results", table_sink)
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        self.t_env.register_function("max_add", udaf(MaxAdd(),
                                                     result_type=DataTypes.INT(),
                                                     func_type="pandas"))
        self.t_env.register_table("T", t)
        self.t_env.execute_sql("""
            insert into Results
            select a,
             mean_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN UNBOUNDED preceding AND UNBOUNDED FOLLOWING),
             max_add(b, c)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN UNBOUNDED preceding AND 0 FOLLOWING),
             mean_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING),
             mean_udaf(c)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN 1 PRECEDING AND 0 FOLLOWING),
             mean_udaf(c)
             over (PARTITION BY a ORDER BY rowtime
             RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),
             mean_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW),
             mean_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND UNBOUNDED FOLLOWING),
             mean_udaf(c)
             over (PARTITION BY a ORDER BY rowtime
             RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND UNBOUNDED FOLLOWING),
             mean_udaf(c)
             over (PARTITION BY a ORDER BY rowtime
             RANGE BETWEEN INTERVAL '20' MINUTE PRECEDING AND CURRENT ROW)
            from T
        """).wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[1, 4.3333335, 5, 4.3333335, 3.0, 3.0, 2.5, 4.3333335, 3.0, 2.0]",
                            "+I[1, 4.3333335, 13, 5.5, 3.0, 3.0, 4.3333335, 8.0, 5.0, 5.0]",
                            "+I[1, 4.3333335, 6, 4.3333335, 2.0, 3.0, 2.5, 4.3333335, 3.0, 2.0]",
                            "+I[2, 2.0, 9, 2.0, 4.0, 4.0, 2.0, 2.0, 4.0, 4.0]",
                            "+I[2, 2.0, 3, 2.0, 2.0, 4.0, 1.0, 2.0, 4.0, 2.0]",
                            "+I[3, 2.0, 3, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0]"])
Example 27
    def test_row_time_over_rows_window_aggregate_function(self):
        # create source file path
        import tempfile
        import os
        tmp_dir = tempfile.gettempdir()
        data = [
            '1,1,2013-01-01 03:10:00',
            '3,2,2013-01-01 03:10:00',
            '2,1,2013-01-01 03:10:00',
            '1,5,2013-01-01 03:10:00',
            '1,8,2013-01-01 04:20:00',
            '2,3,2013-01-01 03:30:00'
        ]
        source_path = tmp_dir + '/test_over_rows_window_aggregate_function.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        max_add_min_udaf = udaf(lambda a: a.max() + a.min(),
                                result_type=DataTypes.SMALLINT(),
                                func_type='pandas')
        self.t_env.get_config().set(
            "pipeline.time-characteristic", "EventTime")
        self.t_env.register_function("mean_udaf", mean_udaf)
        self.t_env.register_function("max_add_min_udaf", max_add_min_udaf)
        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b FLOAT, c SMALLINT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        self.t_env.execute_sql("""
            insert into Results
            select a,
             mean_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN 1 PRECEDING AND CURRENT ROW),
             max_add_min_udaf(b)
             over (PARTITION BY a ORDER BY rowtime
             ROWS BETWEEN 1 PRECEDING AND CURRENT ROW)
            from source_table
        """).wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[1, 1.0, 2]",
                            "+I[1, 3.0, 6]",
                            "+I[1, 6.5, 13]",
                            "+I[2, 1.0, 2]",
                            "+I[2, 2.0, 4]",
                            "+I[3, 2.0, 4]"])
        os.remove(source_path)
Example 28
import json

from pyflink.common import Row
from pyflink.table import (AggregateFunction, DataTypes, EnvironmentSettings,
                           TableAggregateFunction, TableEnvironment)
from pyflink.table.expressions import col
from pyflink.table.udf import udaf, udf, udtaf, udtf


def row_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(elements=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "China", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                                schema=['id', 'data'])

    # map operation
    @udf(result_type=DataTypes.ROW([
        DataTypes.FIELD("id", DataTypes.BIGINT()),
        DataTypes.FIELD("country", DataTypes.STRING())
    ]))
    def extract_country(input_row: Row):
        data = json.loads(input_row.data)
        return Row(input_row.id, data['addr']['country'])

    table.map(extract_country) \
         .execute().print()
    # +----+----------------------+--------------------------------+
    # | op |                   id |                        country |
    # +----+----------------------+--------------------------------+
    # | +I |                    1 |                        Germany |
    # | +I |                    2 |                          China |
    # | +I |                    3 |                          China |
    # | +I |                    4 |                          China |
    # +----+----------------------+--------------------------------+

    # flat_map operation
    @udtf(result_types=[DataTypes.BIGINT(), DataTypes.STRING()])
    def extract_city(input_row: Row):
        data = json.loads(input_row.data)
        yield input_row.id, data['addr']['city']

    table.flat_map(extract_city) \
         .execute().print()

    # +----+----------------------+--------------------------------+
    # | op |                   f0 |                             f1 |
    # +----+----------------------+--------------------------------+
    # | +I |                    1 |                         Berlin |
    # | +I |                    2 |                       Shanghai |
    # | +I |                    3 |                        NewYork |
    # | +I |                    4 |                       Hangzhou |
    # +----+----------------------+--------------------------------+

    # aggregate operation
    class CountAndSumAggregateFunction(AggregateFunction):
        def get_value(self, accumulator):
            return Row(accumulator[0], accumulator[1])

        def create_accumulator(self):
            return Row(0, 0)

        def accumulate(self, accumulator, input_row):
            accumulator[0] += 1
            accumulator[1] += int(input_row.tel)

        def retract(self, accumulator, input_row):
            accumulator[0] -= 1
            accumulator[1] -= int(input_row.tel)

        def merge(self, accumulator, accumulators):
            for other_acc in accumulators:
                accumulator[0] += other_acc[0]
                accumulator[1] += other_acc[1]

        def get_accumulator_type(self):
            return DataTypes.ROW([
                DataTypes.FIELD("cnt", DataTypes.BIGINT()),
                DataTypes.FIELD("sum", DataTypes.BIGINT())
            ])

        def get_result_type(self):
            return DataTypes.ROW([
                DataTypes.FIELD("cnt", DataTypes.BIGINT()),
                DataTypes.FIELD("sum", DataTypes.BIGINT())
            ])

    count_sum = udaf(CountAndSumAggregateFunction())
    table.add_columns(
            col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
            col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
            col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
         .group_by(col('country')) \
         .aggregate(count_sum.alias("cnt", "sum")) \
         .select(col('country'), col('cnt'), col('sum')) \
         .execute().print()

    # +----+--------------------------------+----------------------+----------------------+
    # | op |                        country |                  cnt |                  sum |
    # +----+--------------------------------+----------------------+----------------------+
    # | +I |                          China |                    3 |                  291 |
    # | +I |                        Germany |                    1 |                  123 |
    # +----+--------------------------------+----------------------+----------------------+

    # flat_aggregate operation
    class Top2(TableAggregateFunction):
        def emit_value(self, accumulator):
            for v in accumulator:
                if v:
                    yield Row(v)

        def create_accumulator(self):
            return [None, None]

        def accumulate(self, accumulator, input_row):
            tel = int(input_row.tel)
            if accumulator[0] is None or tel > accumulator[0]:
                accumulator[1] = accumulator[0]
                accumulator[0] = tel
            elif accumulator[1] is None or tel > accumulator[1]:
                accumulator[1] = tel

        def get_accumulator_type(self):
            return DataTypes.ARRAY(DataTypes.BIGINT())

        def get_result_type(self):
            return DataTypes.ROW([DataTypes.FIELD("tel", DataTypes.BIGINT())])

    top2 = udtaf(Top2())
    table.add_columns(
            col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
            col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
            col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
        .group_by(col('country')) \
        .flat_aggregate(top2) \
        .select(col('country'), col('tel')) \
        .execute().print()
Example 29
from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes
from pyflink.table.udf import udaf


class WeightedAvg(AggregateFunction):
    def create_accumulator(self):
        # Row(sum, count)
        return Row(0, 0)

    def get_value(self, accumulator: Row) -> float:
        if accumulator[1] == 0:
            return 0
        else:
            return accumulator[0] / accumulator[1]

    def accumulate(self, accumulator: Row, value, weight):
        accumulator[0] += value * weight
        accumulator[1] += weight

    def retract(self, accumulator: Row, value, weight):
        accumulator[0] -= value * weight
        accumulator[1] -= weight


weighted_avg = udaf(f=WeightedAvg(),
                    result_type=DataTypes.DOUBLE(),
                    accumulator_type=DataTypes.ROW([
                        DataTypes.FIELD("f0", DataTypes.BIGINT()),
                        DataTypes.FIELD("f1", DataTypes.BIGINT())
                    ]))
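
The example declares weighted_avg but never calls it; a hypothetical usage sketch (the environment setup, table and column names below are illustrative, not part of the original example):

from pyflink.table import EnvironmentSettings, TableEnvironment

t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
t = t_env.from_elements([('a', 100, 1), ('a', 200, 3), ('b', 50, 2)],
                        ['k', 'v', 'w'])
t.group_by(t.k) \
    .select(t.k, weighted_avg(t.v, t.w).alias('avg')) \
    .execute().print()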