def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///app/src/kafka-clients-2.8.0.jar")
    env.add_jars("file:///app/src/flink-connector-kafka_2.12-1.12.3.jar")
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    config = env.get_checkpoint_config()
    config.enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION)

    st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())

    print("register kafka source")
    register_kafka_source(st_env)
    print("register transaction sinks")
    register_transactions_sink_into_csv(st_env)

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w")) \
        .select("count(message) as total, w.end as end_time") \
        .insert_into("total_sink")

    st_env.from_path("source_tbl") \
        .where("message = 'dolorem'") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w")) \
        .select("count(message) as total, w.end as end_time") \
        .insert_into("grep_sink")

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w"), col("message")) \
        .select("count(message) as total, message, w.end as end_time") \
        .insert_into("topk_sink")

    st_env.execute("app")
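# `register_kafka_source` and `register_transactions_sink_into_csv` are
# referenced above but not defined in this snippet. A minimal sketch of the
# source helper, assuming a JSON Kafka topic that provides the `message`/`ts`
# columns the queries use (topic name and connector options are illustrative):
def register_kafka_source(st_env):
    st_env.execute_sql("""
        CREATE TABLE source_tbl (
            message STRING,
            ts TIMESTAMP(3),
            WATERMARK FOR ts AS ts - INTERVAL '5' SECOND
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'transactions',
            'properties.bootstrap.servers' = 'kafka:9092',
            'scan.startup.mode' = 'earliest-offset',
            'format' = 'json'
        )
    """)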
def test_sliding_group_window_over_proctime(self):
    self.t_env.get_config().set("parallelism.default", "1")
    from pyflink.table.window import Slide
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a INT,
            proctime as PROCTIME()
        ) with(
            'connector' = 'datagen',
            'rows-per-second' = '1',
            'fields.a.kind' = 'sequence',
            'fields.a.start' = '1',
            'fields.a.end' = '10'
        )
    """
    self.t_env.execute_sql(source_table)

    t = self.t_env.from_path("source_table")
    iterator = t.select(t.a, t.proctime) \
        .window(Slide.over(lit(1).seconds)
                .every(lit(1).seconds)
                .on(t.proctime)
                .alias("w")) \
        .group_by(t.a, col("w")) \
        .select(mean_udaf(t.a).alias("b"), col("w").start).execute().collect()
    result = [i for i in iterator]
    # if the WindowAssigner.isEventTime() does not return false,
    # the w.start would be 1970-01-01
    # TODO: After fixing the TimeZone problem of window with processing time
    # (will be fixed in FLIP-162), we should replace it with a more accurate assertion.
    self.assertTrue(result[0][1].year > 1970)
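# `mean_udaf` is used above (and in later snippets) but not defined here.
# A minimal sketch, assuming a vectorized (pandas) UDAF that averages its
# input column:
from pyflink.table import DataTypes
from pyflink.table.udf import udaf

@udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
def mean_udaf(v):
    return v.mean()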
def popular_taxi_vendor():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)
    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"))
    taxi_ride = t_env.from_path('TaxiRide')
    popular_rides = taxi_ride.select(taxi_ride.vendorId, taxi_ride.pickupTime) \
        .window(Slide.over('15.minutes').every('5.minutes').on(taxi_ride.pickupTime).alias('w')) \
        .group_by(taxi_ride.vendorId, col('w')) \
        .select(taxi_ride.vendorId,
                col('w').start.alias('start'),
                col('w').end.alias('end'),
                taxi_ride.vendorId.count.alias('cnt'))

    t_env.to_append_stream(
        popular_rides,
        Types.ROW_NAMED(['vendorId', 'start', 'end', 'cnt'], [
            Types.INT(),
            Types.SQL_TIMESTAMP(),
            Types.SQL_TIMESTAMP(),
            Types.LONG()
        ])).print()

    env.execute('Popular-Taxi-Vendor')
def test_sliding_group_window_over_count(self):
    self.t_env.get_config().set("parallelism.default", "1")
    # create source file path
    import tempfile
    import os
    tmp_dir = tempfile.gettempdir()
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_sliding_group_window_over_count.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')
    from pyflink.table.window import Slide
    self.t_env.get_config().set(
        "pipeline.time-characteristic", "ProcessingTime")
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            protime as PROCTIME()
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)

    t = self.t_env.from_path("source_table")
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'd'],
        [DataTypes.TINYINT(), DataTypes.FLOAT()])
    self.t_env.register_table_sink("Results", table_sink)
    t.window(Slide.over(row_interval(2))
             .every(row_interval(1))
             .on(t.protime)
             .alias("w")) \
        .group_by(t.a, t.b, col("w")) \
        .select(t.a, mean_udaf(t.c).alias("b")) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[1, 2.5]", "+I[1, 5.5]", "+I[2, 2.0]", "+I[3, 2.5]"])
    os.remove(source_path)
def test_slide_window(self):
    t = self.t_env.from_elements([(1000, 1, "Hello")], ["a", "b", "c"])
    result = t.window(Slide.over(expr.lit(2).seconds).every(expr.lit(1).seconds).on("a")
                      .alias("w")).group_by(expr.col('w'), expr.col('c')).select(t.b.sum)
    query_operation = result._j_table.getQueryOperation().getChildren().get(0)
    self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
    self.assertEqual('SlideWindow(field: [a], slide: [1000], size: [2000])',
                     query_operation.getGroupWindow().asSummaryString())
def test_slide_group_window_aggregate_function(self):
    import datetime
    from pyflink.table.window import Slide
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd', 'e'],
        [
            DataTypes.TINYINT(),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.FLOAT(),
            DataTypes.INT()
        ])
    self.t_env.register_table_sink("Results", table_sink)
    self.t_env.register_function("max_add",
                                 udaf(MaxAdd(),
                                      result_type=DataTypes.INT(),
                                      func_type="pandas"))
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
    slide_window = Slide.over(lit(1).hours) \
        .every(lit(30).minutes) \
        .on(col("rowtime")) \
        .alias("w")
    t.window(slide_window) \
        .group_by(t.a, col("w")) \
        .select(t.a,
                col("w").start,
                col("w").end,
                mean_udaf(t.b),
                call("max_add", t.b, t.c, 1)) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[1, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0, 6]",
                        "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.5, 7]",
                        "+I[1, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 5.5, 14]",
                        "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0, 14]",
                        "+I[2, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 1.0, 4]",
                        "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0, 10]",
                        "+I[2, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 3.0, 10]",
                        "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0, 7]",
                        "+I[3, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0, 7]"])
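# `MaxAdd` is used above but not defined in this snippet. A plausible sketch,
# assuming a pandas UDAF that sums the per-column maxima of its arguments
# (consistent with the expected results above, e.g. max(b) + max(c) + 1):
from pyflink.table.udf import AggregateFunction

class MaxAdd(AggregateFunction):

    def create_accumulator(self):
        return []

    def get_value(self, accumulator):
        return accumulator[0]

    def accumulate(self, accumulator, *args):
        # with func_type="pandas", each arg arrives as a pandas.Series
        result = 0
        for arg in args:
            result += arg.max()
        accumulator.append(result)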
def perform_sliding_window_aggregation(input_table_name):
    # use SQL Table in the Table API
    input_table = table_env.from_path(input_table_name)

    sliding_window_table = (input_table.window(
        Slide.over("10.seconds").every("5.seconds").on("event_time").alias(
            "ten_second_window")
    ).group_by("ticker, ten_second_window").select(
        "ticker, price.min as price, ten_second_window.end as event_time"))

    return sliding_window_table
def perform_sliding_window_aggregation(input_table_name):
    # use SQL Table in the Table API
    input_table = table_env.from_path(input_table_name)

    sliding_window_table = (input_table.window(
        Slide.over("10.seconds").every("5.seconds").on("EVENT_TIME").alias(
            "ten_second_window")
    ).group_by("TICKER, ten_second_window").select(
        "TICKER, PRICE.min as PRICE, ten_second_window.end as EVENT_TIME"))

    return sliding_window_table
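# The two helpers above use the legacy string-based expression syntax, which
# later PyFlink releases deprecate in favor of the expression DSL. A sketch of
# the same aggregation in the DSL (PyFlink >= 1.12), reusing the column names
# from the variant directly above:
from pyflink.table.expressions import col, lit
from pyflink.table.window import Slide

def perform_sliding_window_aggregation_dsl(input_table_name):
    input_table = table_env.from_path(input_table_name)
    return (input_table.window(
        Slide.over(lit(10).seconds).every(lit(5).seconds)
             .on(col("EVENT_TIME")).alias("ten_second_window"))
        .group_by(col("TICKER"), col("ten_second_window"))
        .select(col("TICKER"),
                col("PRICE").min.alias("PRICE"),
                col("ten_second_window").end.alias("EVENT_TIME")))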
def test_slide_window(self):
    t = self.t_env.from_elements([(1000, 1, "Hello"), (2000, 2, "Hello"),
                                  (3000, 4, "Hello"), (4000, 8, "Hello")],
                                 ["a", "b", "c"])
    result = t.window(Slide.over("2.seconds").every("1.seconds").on("a").alias("w")) \
        .group_by("w, c").select("b.sum")
    actual = self.collect(result)
    expected = ['1', '3', '6', '12', '8']
    self.assert_equals(actual, expected)
def test_sliding_group_window_over_count(self):
    self.t_env.get_config().get_configuration().set_string("parallelism.default", "1")
    # create source file path
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_sliding_group_window_over_count.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')
    self.t_env.register_function("my_sum", SumAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            protime as PROCTIME()
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)

    t = self.t_env.from_path("source_table")
    from pyflink.testing import source_sink_utils
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'd'],
        [DataTypes.TINYINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)
    t.window(Slide.over("2.rows").every("1.rows").on("protime").alias("w")) \
        .group_by("a, w") \
        .select("a, my_sum(c) as b") \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[1, 5]", "+I[1, 11]", "+I[2, 4]", "+I[3, 5]"])
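# `SumAggregateFunction` is referenced here (and in later snippets) but not
# defined. A minimal sketch of a general Python UDAF that sums its input,
# following the Row-accumulator pattern from the PyFlink docs:
from pyflink.common import Row
from pyflink.table import DataTypes
from pyflink.table.udf import AggregateFunction

class SumAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # Row(sum)
        return Row(0)

    def get_value(self, accumulator):
        return accumulator[0]

    def accumulate(self, accumulator, v):
        accumulator[0] += v

    def retract(self, accumulator, v):
        accumulator[0] -= v

    def get_result_type(self):
        return DataTypes.BIGINT()

    def get_accumulator_type(self):
        return DataTypes.ROW([DataTypes.FIELD("f0", DataTypes.BIGINT())])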
def sliding_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    # define the sliding window operation
    table = table.window(Slide.over(lit(5).seconds).every(lit(2).seconds).on(col("ts")).alias("w")) \
                 .group_by(col('name'), col('w')) \
                 .select(col('name'), col('price').sum, col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
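# For comparison, a minimal sketch of the same sliding aggregation in SQL using
# a HOP group window. It assumes `t_env` and the un-windowed `ts`/`name`/`price`
# table from the demo above are in scope; "orders" is an assumed view name.
t_env.create_temporary_view("orders", table)
t_env.sql_query("""
    SELECT
        name,
        SUM(price) AS total_price,
        HOP_START(ts, INTERVAL '2' SECOND, INTERVAL '5' SECOND) AS w_start,
        HOP_END(ts, INTERVAL '2' SECOND, INTERVAL '5' SECOND) AS w_end
    FROM orders
    GROUP BY name, HOP(ts, INTERVAL '2' SECOND, INTERVAL '5' SECOND)
""").execute().print()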
def test_slide_window(self):
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.INT(), DataTypes.STRING()]
    data = [(1000, 1, "Hello"), (2000, 2, "Hello"), (3000, 4, "Hello"), (4000, 8, "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env = self.t_env
    t_env.register_table_source("Source", csv_source)
    source = t_env.scan("Source")
    result = source.window(Slide.over("2.seconds").every("1.seconds").on("a").alias("w")) \
        .group_by("w, c").select("b.sum")
    actual = self.collect(result)
    expected = ['1', '3', '6', '12', '8']
    self.assert_equals(actual, expected)
def test_slide_window(self):
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    # DataTypes entries must be instantiated; BIGINT replaces the nonexistent LONG
    field_types = [DataTypes.BIGINT(), DataTypes.INT(), DataTypes.STRING()]
    data = [(1000, 1, "Hello"), (2000, 2, "Hello"), (3000, 4, "Hello"), (4000, 8, "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env = self.t_env
    t_env.register_table_source("Source", csv_source)
    source = t_env.scan("Source")
    result = source.window(Slide.over("2.seconds").every("1.seconds").on("a").alias("w")) \
        .group_by("w, c").select("b.sum")
    actual = self.collect(result)
    expected = ['1', '3', '6', '12', '8']
    self.assert_equals(actual, expected)
def test_sliding_group_window_over_count(self):
    self.t_env.get_config().set("parallelism.default", "1")
    # create source file path
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_sliding_group_window_over_count.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')
    self.t_env.register_function("my_sum", SumAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            protime as PROCTIME()
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)

    t = self.t_env.from_path("source_table")
    from pyflink.testing import source_sink_utils
    sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, d BIGINT) WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)
    t.window(Slide.over(row_interval(2)).every(row_interval(1)).on(t.protime).alias("w")) \
        .group_by(t.a, col("w")) \
        .select(t.a, call("my_sum", t.c).alias("b")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[1, 5]", "+I[1, 11]", "+I[2, 4]", "+I[3, 5]"])
def slide_time_window_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_slide_time_window_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file,
                       ["a", "b", "c", "rowtime"],
                       [
                           DataTypes.STRING(),
                           DataTypes.INT(),
                           DataTypes.INT(),
                           DataTypes.TIMESTAMP()
                       ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Slide.over("60.minutes").every("10.minutes").on("rowtime").alias("w")) \
        .group_by("w").select("b.sum")
    result.insert_into("result")
    bt_env.execute("slide time window batch")
def slide_row_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/slide_row_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("proctime", DataTypes.TIMESTAMP())
            .proctime()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("source")
    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.STRING()],
            result_file))
    st_env.scan("source").window(Slide.over("2.rows").every("1.rows").on("proctime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")
    st_env.execute("slide row window streaming")
def log_processing():
    env_settings = EnvironmentSettings.new_instance().in_streaming_mode() \
        .use_blink_planner().build()
    t_env = StreamTableEnvironment.create(environment_settings=env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///Users/liuhongwei/.m2/repository/org/apache/flink/flink-connector-kafka_2.11/1.12.0/flink-connector-kafka_2.11-1.12.0.jar;"
        "file:///Users/liuhongwei/.m2/repository/net/sf/json-lib/json-lib/2.3/json-lib-2.3-jdk15.jar;"
        "file:///Users/liuhongwei/.m2/repository/org/apache/kafka/kafka-clients/2.4.1/kafka-clients-2.4.1.jar"
    )

    source_ddl = """
        CREATE TABLE source_table(
            token VARCHAR,
            stime BIGINT,
            appKey VARCHAR,
            user_action_time AS PROCTIME()
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'markTopic',
            'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
            'properties.group.id' = 'test_3',
            'scan.startup.mode' = 'earliest-offset',
            'format' = 'json'
        )
    """

    sink_ddl = """
        CREATE TABLE sink_table(
            token VARCHAR,
            appKey VARCHAR,
            stime TIMESTAMP(3) NOT NULL,
            nums BIGINT NOT NULL
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'markTopic1',
            'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
            'format' = 'json'
        )
    """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    query_sql = """
        SELECT
            token,
            appKey,
            TUMBLE_START(user_action_time, INTERVAL '5' MINUTE) as stime,
            COUNT(token) as nums
        FROM source_table
        WHERE appKey = 'YSHAppAndroidIOSH5'
        GROUP BY token, appKey, TUMBLE(user_action_time, INTERVAL '5' MINUTE)
    """
    # t_env.sql_query(query_sql) \
    #     .execute_insert("sink_table").wait()

    source_t = t_env.from_path("source_table")
    result = source_t.filter(source_t.appKey == "YSHAppAndroidIOSH5") \
        .window(Slide.over(lit(1).days)
                .every(lit(1).minutes)
                .on(source_t.user_action_time).alias("w")) \
        .group_by(source_t.token, source_t.appKey, col("w")) \
        .select(source_t.token,
                source_t.appKey,
                col("w").start.alias("stime"),
                source_t.token.count.alias("nums"))
    result.execute_insert("sink_table").wait()
def test_sliding_group_window_over_time(self):
    # create source file path
    import tempfile
    import os
    tmp_dir = tempfile.gettempdir()
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_sliding_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')
    from pyflink.table.window import Slide
    self.t_env.get_config().set(
        "pipeline.time-characteristic", "EventTime")
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)

    t = self.t_env.from_path("source_table")
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd'],
        [
            DataTypes.TINYINT(),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.FLOAT()
        ])
    self.t_env.register_table_sink("Results", table_sink)
    t.window(Slide.over(lit(1).hours)
             .every(lit(30).minutes)
             .on(col("rowtime"))
             .alias("w")) \
        .group_by(t.a, t.b, col("w")) \
        .select(t.a, col("w").start, col("w").end, mean_udaf(t.c).alias("b")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[1, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0]",
                        "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.5]",
                        "+I[1, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 5.5]",
                        "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0]",
                        "+I[2, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 1.0]",
                        "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0]",
                        "+I[2, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 3.0]",
                        "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0]",
                        "+I[3, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0]"])
    os.remove(source_path)
def test_sliding_group_window_over_time(self):
    # create source file path
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:30:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
    ]
    source_path = tmp_dir + '/test_sliding_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')
    self.t_env.create_temporary_system_function("my_sum", SumAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c INT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)

    t = self.t_env.from_path("source_table")
    from pyflink.testing import source_sink_utils
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd'],
        [
            DataTypes.TINYINT(),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.BIGINT()
        ])
    self.t_env.register_table_sink("Results", table_sink)
    t.window(Slide.over(lit(1).hours)
             .every(lit(30).minutes)
             .on(t.rowtime)
             .alias("w")) \
        .group_by(t.a, col("w")) \
        .select(t.a, col("w").start, col("w").end, call("my_sum", t.c).alias("c")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "+I[1, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2]",
        "+I[2, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 1]",
        "+I[3, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2]",
        "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 5]",
        "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2]",
        "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2]",
        "+I[2, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 1]",
        "+I[1, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 11]",
        "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8]"
    ])
    :param name:
    :param sex:
    :param action:
    :param is_delete:
    :return:
    """
    names = name[sex == 'female']
    return names.value_counts().iloc[:10].to_json()


# register the UDAFs
t_env.create_temporary_function('male_click_top10', male_click_top10)
t_env.create_temporary_function('female_click_top10', female_click_top10)

# ########################### streaming job ###########################
slide_window = Slide.over("60.seconds").every("1.seconds").on('ts').alias("w")  # sliding window

# based on the Table API
t_env.from_path('source') \
    .filter("action = 'click' and is_delete = 1") \
    .window(slide_window) \
    .group_by("w") \
    .select("male_click_top10(name, sex) AS male_top10, "
            "female_click_top10(name, sex) AS female_top10, "
            "w.start AS start_time, "
            "w.end AS end_time") \
    .execute_insert("sink")
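# The snippet above starts mid-way through `female_click_top10`. A sketch of
# how the pair could be defined as vectorized (pandas) UDAFs, with signatures
# inferred from the call sites; the docstring above also lists `action` and
# `is_delete`, which the calls don't pass, so this is an assumption:
from pyflink.table import DataTypes
from pyflink.table.udf import udaf

@udaf(result_type=DataTypes.STRING(), func_type="pandas")
def male_click_top10(name, sex):
    # top-10 most frequent names among male clicks, serialized as JSON
    return name[sex == 'male'].value_counts().iloc[:10].to_json()

@udaf(result_type=DataTypes.STRING(), func_type="pandas")
def female_click_top10(name, sex):
    # top-10 most frequent names among female clicks, serialized as JSON
    return name[sex == 'female'].value_counts().iloc[:10].to_json()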