def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
def over_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('max_price', DataTypes.FLOAT())
                               .build())
                       .build())

    # define the over window operation: running max of price over the two preceding rows
    table = table.over_window(
        Over.partition_by(col("name"))
            .order_by(col("ts"))
            .preceding(row_interval(2))
            .following(CURRENT_ROW)
            .alias('w')) \
        .select(table.name, table.price.max.over(col('w')))

    # submit for execution
    table.execute_insert('sink') \
         .wait()
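# The two windowing examples above are shown without their module preamble. The import
# sketch below is what they would typically need; it assumes a recent PyFlink release
# (1.15+) where these names live in the modules shown, so treat the exact import paths
# as an assumption rather than part of the original snippets.
from pyflink.common import Types
from pyflink.common.time import Instant
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import DataTypes, Schema, StreamTableEnvironment, TableDescriptor
from pyflink.table.expressions import CURRENT_ROW, col, lit, row_interval
from pyflink.table.udf import udaf
from pyflink.table.window import Over, Tumble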
def word_count(input_path, output_path):
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # write all the data to one file
    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    # define the source
    if input_path is not None:
        t_env.create_temporary_table(
            'source',
            TableDescriptor.for_connector('filesystem')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .build())
                           .option('path', input_path)
                           .format('csv')
                           .build())
        tab = t_env.from_path('source')
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        tab = t_env.from_elements(map(lambda i: (i,), word_count_data),
                                  DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())]))

    # define the sink
    if output_path is not None:
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('filesystem')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .column('count', DataTypes.BIGINT())
                                   .build())
                           .option('path', output_path)
                           .format(FormatDescriptor.for_format('canal-json')
                                   .build())
                           .build())
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('print')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .column('count', DataTypes.BIGINT())
                                   .build())
                           .build())

    @udtf(result_types=[DataTypes.STRING()])
    def split(line: Row):
        for s in line[0].split():
            yield Row(s)

    # compute word count
    tab.flat_map(split).alias('word') \
       .group_by(col('word')) \
       .select(col('word'), lit(1).count) \
       .execute_insert('sink') \
       .wait()
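# word_count() above references word_count_data, which is not defined in this section.
# In the original example it is a module-level list of input lines; the definition below
# is only a hypothetical stand-in for local testing, not the original data set.
word_count_data = ["to be or not to be that is the question",
                   "whether tis nobler in the mind to suffer"]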
def process_json_data():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .build())

    # extract the country from the nested JSON document
    table = table.select(col('id'),
                         col('data').json_value('$.addr.country', DataTypes.STRING()))

    # execute
    table.execute_insert('sink') \
         .wait()
def mixing_use_of_datastream_and_table():
    # use StreamTableEnvironment instead of TableEnvironment when mixing use of table & datastream
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source
    t_env.create_temporary_table(
        'source',
        TableDescriptor.for_connector('datagen')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .option("number-of-rows", "10")
                       .build())

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('a', DataTypes.BIGINT())
                               .build())
                       .build())

    @udf(result_type=DataTypes.BIGINT())
    def length(data):
        return len(data)

    # perform table api operations
    table = t_env.from_path("source")
    table = table.select(col('id'), length(col('data')))

    # convert table to datastream and perform datastream api operations
    ds = t_env.to_data_stream(table)
    ds = ds.map(lambda i: i[0] + i[1], output_type=Types.LONG())

    # convert datastream to table and perform table api operations as you want
    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column("f0", DataTypes.BIGINT())
              .build())

    # execute
    table.execute_insert('sink') \
         .wait()
def setUp(self) -> None:
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(1)
    # find_jar_path() is assumed to be a test helper that locates the connector jar under test
    self.env.add_jars("file://{}".format(find_jar_path()))
    self.t_env = StreamTableEnvironment.create(self.env)
    # datagen source producing two bounded sequences: x in [1, 100] and a in [101, 200]
    self.source_table = self.t_env.from_descriptor(
        TableDescriptor.for_connector("datagen")
                       .schema(Schema.new_builder()
                               .column("x", DataTypes.INT())
                               .column("a", DataTypes.INT())
                               .build())
                       .option("fields.x.kind", "sequence")
                       .option("fields.x.start", "1")
                       .option("fields.x.end", "100")
                       .option("fields.a.kind", "sequence")
                       .option("fields.a.start", "101")
                       .option("fields.a.end", "200")
                       .build())
def process_json_data_with_udf():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .build())

    # update json columns
    @udf(result_type=DataTypes.STRING())
    def update_tel(data):
        json_data = json.loads(data)
        json_data['tel'] += 1
        return json.dumps(json_data)

    table = table.select(table.id, update_tel(table.data))

    # execute
    table.execute_insert('sink') \
         .wait()
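# The remaining examples assume a module preamble roughly like the sketch below, plus a
# small driver. The logging setup mirrors the pattern commonly used in PyFlink examples;
# the exact import paths and the choice of functions invoked here are assumptions about
# the surrounding module, not part of the original snippets.
import json
import logging
import sys

from pyflink.common import Row
from pyflink.table import (DataTypes, EnvironmentSettings, FormatDescriptor, Schema,
                           TableDescriptor, TableEnvironment)
from pyflink.table.expressions import col, lit
from pyflink.table.udf import udf, udtf

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
    process_json_data()
    process_json_data_with_udf()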