def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)  # StreamExecutionEnvironment

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    # raises an error if the output file already exists
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()

def demo02():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)  # StreamExecutionEnvironment

    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/input'
        )
    """
    my_sink_ddl = """
        create table mySink (
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output'
        )
    """
    t_env.execute_sql(my_source_ddl)
    t_env.execute_sql(my_sink_ddl)

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()

def test_tumble_group_window_aggregate_function(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

    sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b TIMESTAMP(3), c FLOAT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
    tumble_window = Tumble.over(lit(1).hours) \
        .on(col("rowtime")) \
        .alias("w")
    t.window(tumble_window) \
        .group_by(col("w")) \
        .select(col("w").start, col("w").end, mean_udaf(t.b)) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[2018-03-11T03:00, 2018-03-11T04:00, 2.2]",
                        "+I[2018-03-11T04:00, 2018-03-11T05:00, 8.0]"])

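# mean_udaf is registered above (and reused in several later snippets) but not
# defined here; a minimal sketch, assuming the pandas-based aggregate that the
# pandas_udaf example further down uses, would be:
from pyflink.table import DataTypes
from pyflink.table.udf import udaf


@udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
def mean_udaf(v):
    # vectorized aggregate: mean over a pandas.Series
    return v.mean()
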
def test_tumbling_group_window_over_time(self):
    # create source file path
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:30:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
    ]
    source_path = tmp_dir + '/test_tumbling_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    self.t_env.create_temporary_system_function(
        "my_count", CountDistinctAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c INT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")
    from pyflink.testing import source_sink_utils
    sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d BIGINT, e BIGINT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)
    t.window(Tumble.over(lit(1).hours).on(t.rowtime).alias("w")) \
        .group_by(t.a, col("w")) \
        .select(t.a,
                col("w").start,
                col("w").end,
                t.c.count.alias("c"),
                call("my_count", t.c).alias("d")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "+I[2, 2018-03-11T03:00, 2018-03-11T04:00, 2, 1]",
        "+I[3, 2018-03-11T03:00, 2018-03-11T04:00, 1, 1]",
        "+I[1, 2018-03-11T03:00, 2018-03-11T04:00, 2, 2]",
        "+I[1, 2018-03-11T04:00, 2018-03-11T05:00, 1, 1]"
    ])

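# CountDistinctAggregateFunction is not defined in this snippet. A rough,
# hypothetical sketch of such a general (non-pandas) aggregate, assuming a
# Row(seen-values map, count) accumulator, could look like this:
from pyflink.common import Row
from pyflink.table import DataTypes
from pyflink.table.udf import AggregateFunction


class CountDistinctAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # Row(map of value -> 1, distinct count)
        return Row({}, 0)

    def get_value(self, accumulator):
        return accumulator[1]

    def accumulate(self, accumulator, value):
        if value is not None and value not in accumulator[0]:
            accumulator[0][value] = 1
            accumulator[1] += 1

    def get_accumulator_type(self):
        return DataTypes.ROW([
            DataTypes.FIELD("f0", DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT())),
            DataTypes.FIELD("f1", DataTypes.BIGINT())])

    def get_result_type(self):
        return DataTypes.BIGINT()
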
def test_session_window(self):
    t = self.t_env.from_elements([(1000, 1, "Hello")], ["a", "b", "c"])
    result = t.window(Session.with_gap(expr.lit(1).seconds).on("a").alias("w")) \
        .group_by(expr.col('w'), expr.col('c')).select(t.b.sum)
    query_operation = result._j_table.getQueryOperation().getChildren().get(0)
    self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
    self.assertEqual('SessionWindow(field: [a], gap: [1000])',
                     query_operation.getGroupWindow().asSummaryString())

def test_session_group_window_over_time(self):
    # create source file path
    tmp_dir = self.tempdir
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_session_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    self.t_env.register_function("my_count", CountAggregateFunction())

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")
    from pyflink.testing import source_sink_utils
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd'],
        [
            DataTypes.TINYINT(),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", table_sink)
    t.window(Session.with_gap(lit(30).minutes).on(t.rowtime).alias("w")) \
        .group_by(t.a, t.b, col("w")) \
        .select(t.a, col("w").start, col("w").end, call("my_count", t.c).alias("c")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[3, 2018-03-11 03:10:00.0, 2018-03-11 03:40:00.0, 1]",
                        "+I[2, 2018-03-11 03:10:00.0, 2018-03-11 04:00:00.0, 2]",
                        "+I[1, 2018-03-11 03:10:00.0, 2018-03-11 04:10:00.0, 2]",
                        "+I[1, 2018-03-11 04:20:00.0, 2018-03-11 04:50:00.0, 1]"])

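# CountAggregateFunction is likewise not shown. A minimal sketch following the
# general Python UDAF pattern from the PyFlink docs (field names assumed) would be:
from pyflink.common import Row
from pyflink.table import DataTypes
from pyflink.table.udf import AggregateFunction


class CountAggregateFunction(AggregateFunction):

    def create_accumulator(self):
        # Row(count)
        return Row(0)

    def get_value(self, accumulator):
        return accumulator[0]

    def accumulate(self, accumulator, value):
        if value is not None:
            accumulator[0] += 1

    def retract(self, accumulator, value):
        if value is not None:
            accumulator[0] -= 1

    def get_accumulator_type(self):
        return DataTypes.ROW([DataTypes.FIELD("f0", DataTypes.BIGINT())])

    def get_result_type(self):
        return DataTypes.BIGINT()
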
def test_chaining_scalar_function(self):
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

    t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
    t = t.select(add(add_one(t.a), subtract_one(t.b)), t.c, expr.lit(1))

    result = self.collect(t)
    self.assertEqual(result, ["3,1,1", "7,2,1", "4,3,1"])

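# This test (and its two variants below) relies on an "add" UDF and a SubtractOne
# scalar function that are not shown; plausible definitions, assumed here and
# mirroring the usual PyFlink UDF patterns, would be:
from pyflink.table import DataTypes
from pyflink.table.udf import ScalarFunction, udf


@udf(result_type=DataTypes.BIGINT())
def add(i, j):
    # adds the results of the two chained UDF calls
    return i + j


class SubtractOne(ScalarFunction):

    def eval(self, i):
        return i - 1
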
def demo01():
    # environment configuration
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .in_batch_mode().use_blink_planner().build())

    # register Orders table and Result table sink in table environment
    source_data_path = "/path/to/source/directory/"
    result_data_path = "/path/to/result/directory/"
    source_ddl = f"""
        create table Orders(
            a VARCHAR,
            b BIGINT,
            c BIGINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '{source_data_path}'
        )
    """
    t_env.execute_sql(source_ddl)

    sink_ddl = f"""
        create table `Result`(
            a VARCHAR,
            cnt BIGINT
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '{result_data_path}'
        )
    """
    t_env.execute_sql(sink_ddl)

    # specify table program
    orders = t_env.from_path("Orders")  # schema (a, b, c, rowtime)
    orders.group_by("a").select(orders.a, orders.b.count.alias('cnt')) \
        .execute_insert("Result").wait()

    orders.where(orders.a == 'red')
    orders.filter(orders.b % 2 == 0)
    orders.add_columns(concat(orders.c, 'sunny'))
    orders.add_or_replace_columns(concat(orders.c, 'sunny').alias('desc'))
    orders.drop_columns(orders.b, orders.c)
    orders.rename_columns(orders.b.alias('b2'), orders.c.alias('c2'))
    orders.group_by(orders.a).select(orders.a, orders.b.sum.alias('d'))
    # tab.group_by(tab.key).select(tab.key, tab.value.avg.alias('average'))
    # tab.group_by("key").select("key, value.avg as average")

    result = orders.filter(orders.a.is_not_null & orders.b.is_not_null & orders.c.is_not_null) \
        .select(orders.a.lower_case.alias('a'), orders.b, orders.rowtime) \
        .window(Tumble.over(lit(1).hour).on(orders.rowtime).alias("hourly_window")) \
        .group_by(col('hourly_window'), col('a')) \
        .select(col('a'),
                col('hourly_window').end.alias('hour'),
                col('b').avg.alias('avg_billing_amount'))

def test_expressions(self):
    expr1 = col('a')
    expr2 = col('b')
    expr3 = col('c')

    self.assertEqual('10', str(lit(10, DataTypes.INT(False))))
    self.assertEqual('rangeTo(1, 2)', str(range_(1, 2)))
    self.assertEqual('and(a, b, c)', str(and_(expr1, expr2, expr3)))
    self.assertEqual('or(a, b, c)', str(or_(expr1, expr2, expr3)))

    from pyflink.table.expressions import UNBOUNDED_ROW, UNBOUNDED_RANGE, CURRENT_ROW, \
        CURRENT_RANGE
    self.assertEqual('unboundedRow()', str(UNBOUNDED_ROW))
    self.assertEqual('unboundedRange()', str(UNBOUNDED_RANGE))
    self.assertEqual('currentRow()', str(CURRENT_ROW))
    self.assertEqual('currentRange()', str(CURRENT_RANGE))

    self.assertEqual('currentDate()', str(current_date()))
    self.assertEqual('currentTime()', str(current_time()))
    self.assertEqual('currentTimestamp()', str(current_timestamp()))
    self.assertEqual('localTime()', str(local_time()))
    self.assertEqual('localTimestamp()', str(local_timestamp()))
    self.assertEqual('toTimestampLtz(123, 0)', str(to_timestamp_ltz(123, 0)))
    self.assertEqual("temporalOverlaps(cast('2:55:00', TIME(0)), 3600000, "
                     "cast('3:30:00', TIME(0)), 7200000)",
                     str(temporal_overlaps(
                         lit("2:55:00").to_time,
                         lit(1).hours,
                         lit("3:30:00").to_time,
                         lit(2).hours)))
    self.assertEqual("dateFormat(time, '%Y, %d %M')",
                     str(date_format(col("time"), "%Y, %d %M")))
    self.assertEqual("timestampDiff(DAY, cast('2016-06-15', DATE), cast('2016-06-18', DATE))",
                     str(timestamp_diff(
                         TimePointUnit.DAY,
                         lit("2016-06-15").to_date,
                         lit("2016-06-18").to_date)))
    self.assertEqual('array(1, 2, 3)', str(array(1, 2, 3)))
    self.assertEqual("row('key1', 1)", str(row("key1", 1)))
    self.assertEqual("map('key1', 1, 'key2', 2, 'key3', 3)",
                     str(map_("key1", 1, "key2", 2, "key3", 3)))
    self.assertEqual('4', str(row_interval(4)))
    self.assertEqual('pi()', str(pi()))
    self.assertEqual('e()', str(e()))
    self.assertEqual('rand(4)', str(rand(4)))
    self.assertEqual('randInteger(4)', str(rand_integer(4)))
    self.assertEqual('atan2(1, 2)', str(atan2(1, 2)))
    self.assertEqual('minusPrefix(a)', str(negative(expr1)))
    self.assertEqual('concat(a, b, c)', str(concat(expr1, expr2, expr3)))
    self.assertEqual("concat_ws(', ', b, c)", str(concat_ws(', ', expr2, expr3)))
    self.assertEqual('uuid()', str(uuid()))
    self.assertEqual('null', str(null_of(DataTypes.BIGINT())))
    self.assertEqual('log(a)', str(log(expr1)))
    self.assertEqual('ifThenElse(a, b, c)', str(if_then_else(expr1, expr2, expr3)))
    self.assertEqual('withColumns(a, b, c)', str(with_columns(expr1, expr2, expr3)))
    self.assertEqual('a.b.c(a)', str(call('a.b.c', expr1)))

def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts, name, price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()

def word_count(input_path, output_path):
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    # write all the data to one file
    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    # define the source
    if input_path is not None:
        t_env.create_temporary_table(
            'source',
            TableDescriptor.for_connector('filesystem')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .build())
                           .option('path', input_path)
                           .format('csv')
                           .build())
        tab = t_env.from_path('source')
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        tab = t_env.from_elements(
            map(lambda i: (i, ), word_count_data),
            DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())]))

    # define the sink
    if output_path is not None:
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('filesystem')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .column('count', DataTypes.BIGINT())
                                   .build())
                           .option('path', output_path)
                           .format(FormatDescriptor.for_format('canal-json').build())
                           .build())
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('print')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .column('count', DataTypes.BIGINT())
                                   .build())
                           .build())

    @udtf(result_types=[DataTypes.STRING()])
    def split(line: Row):
        for s in line[0].split():
            yield Row(s)

    # compute word count
    tab.flat_map(split).alias('word') \
       .group_by(col('word')) \
       .select(col('word'), lit(1).count) \
       .execute_insert('sink') \
       .wait()

def test_chaining_scalar_function(self):
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c'],
        [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()])
    self.t_env.register_table_sink("Results", table_sink)

    t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
    t.select(add(add_one(t.a), subtract_one(t.b)), t.c, expr.lit(1)) \
        .execute_insert("Results").wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[3, 1, 1]", "+I[7, 2, 1]", "+I[4, 3, 1]"])

def test_left_outer_join_lateral_with_join_predicate(self):
    t_env = self.t_env
    t_env.create_java_temporary_system_function(
        "split", "org.apache.flink.table.utils.TableFunc1")
    source = t_env.from_elements([("1", "1#3#5#7"), ("2", "2#4#6#8")], ["id", "words"])

    # only support "true" as the join predicate currently
    result = source.left_outer_join_lateral(
        expr.call('split', source.words).alias('word'), expr.lit(True))

    query_operation = result._j_table.getQueryOperation()
    self.assertEqual('LEFT_OUTER', query_operation.getJoinType().toString())
    self.assertTrue(query_operation.isCorrelated())
    self.assertEqual('true', query_operation.getCondition().toString())

def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    # t_config.set_python_executable("/opt/python38/bin/python3")
    # alternatively, add "python.client.executable: /usr/bin/python3" to conf/flink-conf.yaml
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)
    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
    """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
         .select(table.word, expr.lit(1).count.alias('count')) \
         .execute_insert("Results").wait()

def test_chaining_scalar_function(self):
    add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
    subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

    sink_table_ddl = """
        CREATE TABLE Results(a BIGINT, b BIGINT, c INT) WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)

    t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
    t.select(add(add_one(t.a), subtract_one(t.b)), t.c, expr.lit(1)) \
        .execute_insert("Results").wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[3, 1, 1]", "+I[7, 2, 1]", "+I[4, 3, 1]"])

def test_window_aggregate_with_pandas_udaf(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c'],
        [
            DataTypes.TIMESTAMP(3),
            DataTypes.FLOAT(),
            DataTypes.INT()
        ])
    self.t_env.register_table_sink("Results", table_sink)
    pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                       result_type=DataTypes.ROW(
                           [DataTypes.FIELD("a", DataTypes.FLOAT()),
                            DataTypes.FIELD("b", DataTypes.INT())]),
                       func_type="pandas")
    tumble_window = Tumble.over(expr.lit(1).hours) \
        .on(expr.col("rowtime")) \
        .alias("w")
    t.select(t.b, t.rowtime) \
        .window(tumble_window) \
        .group_by("w") \
        .aggregate(pandas_udaf.alias("d", "e")) \
        .select("w.rowtime, d, e") \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["2018-03-11 03:59:59.999,2.2,3",
                        "2018-03-11 04:59:59.999,8.0,8"])

def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)
    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
    """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
         .select(table.word, expr.lit(1).count.alias('count')) \
         .insert_into("Results")

    t_env.execute("word_count")

def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///app/src/kafka-clients-2.8.0.jar")
    env.add_jars("file:///app/src/flink-connector-kafka_2.12-1.12.3.jar")
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    config = env.get_checkpoint_config()
    config.enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION)

    st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())

    print("register kafka source")
    register_kafka_source(st_env)
    print("register transaction sinks")
    register_transactions_sink_into_csv(st_env)

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w")) \
        .select("""count(message) as total,
                   w.end as end_time
                """) \
        .insert_into("total_sink")

    st_env.from_path("source_tbl") \
        .where("message = 'dolorem'") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w")) \
        .select("""count(message) as total,
                   w.end as end_time
                """) \
        .insert_into("grep_sink")

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w"), col("message")) \
        .select("""count(message) as total,
                   message,
                   w.end as end_time
                """) \
        .insert_into("topk_sink")

    st_env.execute("app")

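# register_kafka_source and register_transactions_sink_into_csv are referenced
# above but not shown. A rough sketch of the source side, assuming a "message"
# column and an event-time attribute "ts" (topic, servers, group id and the
# watermark delay below are placeholders, not values from the original code):
def register_kafka_source(st_env):
    st_env.execute_sql("""
        CREATE TABLE source_tbl (
            message VARCHAR,
            ts TIMESTAMP(3),
            WATERMARK FOR ts AS ts - INTERVAL '5' SECOND
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'transactions',
            'properties.bootstrap.servers' = 'localhost:9092',
            'properties.group.id' = 'flink-app',
            'scan.startup.mode' = 'earliest-offset',
            'format' = 'json'
        )
    """)
# The sink registration would create total_sink, grep_sink and topk_sink
# analogously, e.g. with a 'filesystem' connector and 'csv' format.
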
def test_window_aggregate_with_pandas_udaf(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

    sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b FLOAT, c INT) WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)
    print(t.get_schema())
    pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                       result_type=DataTypes.ROW(
                           [DataTypes.FIELD("a", DataTypes.FLOAT()),
                            DataTypes.FIELD("b", DataTypes.INT())]),
                       func_type="pandas")
    tumble_window = Tumble.over(expr.lit(1).hours) \
        .on(expr.col("rowtime")) \
        .alias("w")
    t.select(t.b, t.rowtime) \
        .window(tumble_window) \
        .group_by(expr.col("w")) \
        .aggregate(pandas_udaf.alias("d", "e")) \
        .select(expr.col("w").rowtime, expr.col("d"), expr.col("e")) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[2018-03-11 03:59:59.999, 2.2, 3]",
                        "+I[2018-03-11 04:59:59.999, 8.0, 8]"])

def test_tumble_group_window_aggregate_function(self):
    import datetime
    from pyflink.table.window import Tumble
    t = self.t_env.from_elements(
        [
            (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
            (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
            (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
            (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
        ],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT()),
             DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c'],
        [
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.FLOAT()
        ])
    self.t_env.register_table_sink("Results", table_sink)
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
    tumble_window = Tumble.over(lit(1).hours) \
        .on(col("rowtime")) \
        .alias("w")
    t.window(tumble_window) \
        .group_by(col("w")) \
        .select(col("w").start, col("w").end, mean_udaf(t.b)) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.2]",
                        "+I[2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0]"])

def test_tumbling_group_window_over_time(self):
    # create source file path
    import tempfile
    import os
    tmp_dir = tempfile.gettempdir()
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_tumbling_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    from pyflink.table.window import Tumble
    self.t_env.get_config().set(
        "pipeline.time-characteristic", "EventTime")
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")
    table_sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd', 'e'],
        [
            DataTypes.TINYINT(),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.TIMESTAMP(3),
            DataTypes.FLOAT()])
    self.t_env.register_table_sink("Results", table_sink)
    t.window(Tumble.over(lit(1).hours).on(t.rowtime).alias("w")) \
        .group_by(t.a, t.b, col("w")) \
        .select(t.a, col("w").start, col("w").end, col("w").rowtime, mean_udaf(t.c).alias("b")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2018-03-11 03:59:59.999, 2.5]",
        "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 2018-03-11 04:59:59.999, 8.0]",
        "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2018-03-11 03:59:59.999, 2.0]",
        "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2018-03-11 03:59:59.999, 2.0]",
    ])
    os.remove(source_path)

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.expressions import lit

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('/tmp/input')) \
    .with_format(OldCsv()
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/tmp/output')) \
    .with_format(OldCsv()
                 .field_delimiter('\t')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

tab = t_env.from_path('mySource')
tab.group_by(tab.word) \
    .select(tab.word, lit(1).count) \
    .execute_insert('mySink').wait()

def test_expression(self):
    expr1 = col('a')
    expr2 = col('b')
    expr3 = col('c')
    expr4 = col('d')
    expr5 = lit(10)

    # comparison functions
    self.assertEqual('equals(a, b)', str(expr1 == expr2))
    self.assertEqual('mod(2, b)', str(2 % expr2))
    self.assertEqual('notEquals(a, b)', str(expr1 != expr2))
    self.assertEqual('lessThan(a, b)', str(expr1 < expr2))
    self.assertEqual('lessThanOrEqual(a, b)', str(expr1 <= expr2))
    self.assertEqual('greaterThan(a, b)', str(expr1 > expr2))
    self.assertEqual('greaterThanOrEqual(a, b)', str(expr1 >= expr2))

    # logic functions
    self.assertEqual('and(a, b)', str(expr1 & expr2))
    self.assertEqual('or(a, b)', str(expr1 | expr2))
    self.assertEqual('isNotTrue(a)', str(expr1.is_not_true))
    self.assertEqual('isNotTrue(a)', str(~expr1))

    # arithmetic functions
    self.assertEqual('plus(a, b)', str(expr1 + expr2))
    self.assertEqual('plus(2, b)', str(2 + expr2))
    self.assertEqual('plus(cast(b, DATE), 2)', str(expr2.to_date + 2))
    self.assertEqual('minus(a, b)', str(expr1 - expr2))
    self.assertEqual('minus(cast(b, DATE), 2)', str(expr2.to_date - 2))
    self.assertEqual('times(a, b)', str(expr1 * expr2))
    self.assertEqual('divide(a, b)', str(expr1 / expr2))
    self.assertEqual('mod(a, b)', str(expr1 % expr2))
    self.assertEqual('power(a, b)', str(expr1 ** expr2))
    self.assertEqual('minusPrefix(a)', str(-expr1))

    self.assertEqual('exp(a)', str(expr1.exp))
    self.assertEqual('log10(a)', str(expr1.log10))
    self.assertEqual('log2(a)', str(expr1.log2))
    self.assertEqual('ln(a)', str(expr1.ln))
    self.assertEqual('log(a)', str(expr1.log()))
    self.assertEqual('cosh(a)', str(expr1.cosh))
    self.assertEqual('sinh(a)', str(expr1.sinh))
    self.assertEqual('sin(a)', str(expr1.sin))
    self.assertEqual('cos(a)', str(expr1.cos))
    self.assertEqual('tan(a)', str(expr1.tan))
    self.assertEqual('cot(a)', str(expr1.cot))
    self.assertEqual('asin(a)', str(expr1.asin))
    self.assertEqual('acos(a)', str(expr1.acos))
    self.assertEqual('atan(a)', str(expr1.atan))
    self.assertEqual('tanh(a)', str(expr1.tanh))
    self.assertEqual('degrees(a)', str(expr1.degrees))
    self.assertEqual('radians(a)', str(expr1.radians))
    self.assertEqual('sqrt(a)', str(expr1.sqrt))
    self.assertEqual('abs(a)', str(expr1.abs))
    self.assertEqual('abs(a)', str(abs(expr1)))
    self.assertEqual('sign(a)', str(expr1.sign))
    self.assertEqual('round(a, b)', str(expr1.round(expr2)))
    self.assertEqual('between(a, b, c)', str(expr1.between(expr2, expr3)))
    self.assertEqual('notBetween(a, b, c)', str(expr1.not_between(expr2, expr3)))
    self.assertEqual('ifThenElse(a, b, c)', str(expr1.then(expr2, expr3)))

    self.assertEqual('isNull(a)', str(expr1.is_null))
    self.assertEqual('isNotNull(a)', str(expr1.is_not_null))
    self.assertEqual('isTrue(a)', str(expr1.is_true))
    self.assertEqual('isFalse(a)', str(expr1.is_false))
    self.assertEqual('isNotTrue(a)', str(expr1.is_not_true))
    self.assertEqual('isNotFalse(a)', str(expr1.is_not_false))
    self.assertEqual('distinct(a)', str(expr1.distinct))
    self.assertEqual('sum(a)', str(expr1.sum))
    self.assertEqual('sum0(a)', str(expr1.sum0))
    self.assertEqual('min(a)', str(expr1.min))
    self.assertEqual('max(a)', str(expr1.max))
    self.assertEqual('count(a)', str(expr1.count))
    self.assertEqual('avg(a)', str(expr1.avg))
    self.assertEqual('first_value(a)', str(expr1.first_value))
    self.assertEqual('last_value(a)', str(expr1.last_value))
    self.assertEqual('stddevPop(a)', str(expr1.stddev_pop))
    self.assertEqual('stddevSamp(a)', str(expr1.stddev_samp))
    self.assertEqual('varPop(a)', str(expr1.var_pop))
    self.assertEqual('varSamp(a)', str(expr1.var_samp))
    self.assertEqual('collect(a)', str(expr1.collect))
    self.assertEqual("as(a, 'a', 'b', 'c')", str(expr1.alias('a', 'b', 'c')))
    self.assertEqual('cast(a, INT)', str(expr1.cast(DataTypes.INT())))
    self.assertEqual('asc(a)', str(expr1.asc))
    self.assertEqual('desc(a)', str(expr1.desc))
    self.assertEqual('in(a, b, c, d)', str(expr1.in_(expr2, expr3, expr4)))
    self.assertEqual('start(a)', str(expr1.start))
    self.assertEqual('end(a)', str(expr1.end))
    self.assertEqual('bin(a)', str(expr1.bin))
    self.assertEqual('hex(a)', str(expr1.hex))
    self.assertEqual('truncate(a, 3)', str(expr1.truncate(3)))

    # string functions
    self.assertEqual('substring(a, b, 3)', str(expr1.substring(expr2, 3)))
    self.assertEqual("trim(true, false, ' ', a)", str(expr1.trim_leading()))
    self.assertEqual("trim(false, true, ' ', a)", str(expr1.trim_trailing()))
    self.assertEqual("trim(true, true, ' ', a)", str(expr1.trim()))
    self.assertEqual('replace(a, b, c)', str(expr1.replace(expr2, expr3)))
    self.assertEqual('charLength(a)', str(expr1.char_length))
    self.assertEqual('upper(a)', str(expr1.upper_case))
    self.assertEqual('lower(a)', str(expr1.lower_case))
    self.assertEqual('initCap(a)', str(expr1.init_cap))
    self.assertEqual("like(a, 'Jo_n%')", str(expr1.like('Jo_n%')))
    self.assertEqual("similar(a, 'A+')", str(expr1.similar('A+')))
    self.assertEqual('position(a, b)', str(expr1.position(expr2)))
    self.assertEqual('lpad(a, 4, b)', str(expr1.lpad(4, expr2)))
    self.assertEqual('rpad(a, 4, b)', str(expr1.rpad(4, expr2)))
    self.assertEqual('overlay(a, b, 6, 2)', str(expr1.overlay(expr2, 6, 2)))
    self.assertEqual("regexpReplace(a, b, 'abc')", str(expr1.regexp_replace(expr2, 'abc')))
    self.assertEqual('regexpExtract(a, b, 3)', str(expr1.regexp_extract(expr2, 3)))
    self.assertEqual('fromBase64(a)', str(expr1.from_base64))
    self.assertEqual('toBase64(a)', str(expr1.to_base64))
    self.assertEqual('ltrim(a)', str(expr1.ltrim))
    self.assertEqual('rtrim(a)', str(expr1.rtrim))
    self.assertEqual('repeat(a, 3)', str(expr1.repeat(3)))
    self.assertEqual("over(a, 'w')", str(expr1.over('w')))

    # temporal functions
    self.assertEqual('cast(a, DATE)', str(expr1.to_date))
    self.assertEqual('cast(a, TIME(0))', str(expr1.to_time))
    self.assertEqual('cast(a, TIMESTAMP(3))', str(expr1.to_timestamp))
    self.assertEqual('extract(YEAR, a)', str(expr1.extract(TimeIntervalUnit.YEAR)))
    self.assertEqual('floor(a, YEAR)', str(expr1.floor(TimeIntervalUnit.YEAR)))
    self.assertEqual('ceil(a)', str(expr1.ceil()))

    # advanced type helper functions
    self.assertEqual("get(a, 'col')", str(expr1.get('col')))
    self.assertEqual('flatten(a)', str(expr1.flatten))
    self.assertEqual('at(a, 0)', str(expr1.at(0)))
    self.assertEqual('cardinality(a)', str(expr1.cardinality))
    self.assertEqual('element(a)', str(expr1.element))

    # time definition functions
    self.assertEqual('rowtime(a)', str(expr1.rowtime))
    self.assertEqual('proctime(a)', str(expr1.proctime))
    self.assertEqual('120', str(expr5.year))
    self.assertEqual('120', str(expr5.years))
    self.assertEqual('30', str(expr5.quarter))
    self.assertEqual('30', str(expr5.quarters))
    self.assertEqual('10', str(expr5.month))
    self.assertEqual('10', str(expr5.months))
    self.assertEqual('6048000000', str(expr5.week))
    self.assertEqual('6048000000', str(expr5.weeks))
    self.assertEqual('864000000', str(expr5.day))
    self.assertEqual('864000000', str(expr5.days))
    self.assertEqual('36000000', str(expr5.hour))
    self.assertEqual('36000000', str(expr5.hours))
    self.assertEqual('600000', str(expr5.minute))
    self.assertEqual('600000', str(expr5.minutes))
    self.assertEqual('10000', str(expr5.second))
    self.assertEqual('10000', str(expr5.seconds))
    self.assertEqual('10', str(expr5.milli))
    self.assertEqual('10', str(expr5.millis))

    # hash functions
    self.assertEqual('md5(a)', str(expr1.md5))
    self.assertEqual('sha1(a)', str(expr1.sha1))
    self.assertEqual('sha224(a)', str(expr1.sha224))
    self.assertEqual('sha256(a)', str(expr1.sha256))
    self.assertEqual('sha384(a)', str(expr1.sha384))
    self.assertEqual('sha512(a)', str(expr1.sha512))
    self.assertEqual('sha2(a, 224)', str(expr1.sha2(224)))

    # json functions
    self.assertEqual("IS_JSON('42')", str(lit('42').is_json()))
    self.assertEqual("IS_JSON('42', SCALAR)", str(lit('42').is_json(JsonType.SCALAR)))
    self.assertEqual("JSON_EXISTS('{}', '$.x')", str(lit('{}').json_exists('$.x')))
    self.assertEqual(
        "JSON_EXISTS('{}', '$.x', FALSE)",
        str(lit('{}').json_exists('$.x', JsonExistsOnError.FALSE)))
    self.assertEqual(
        "JSON_VALUE('{}', '$.x', STRING, NULL, null, NULL, null)",
        str(lit('{}').json_value('$.x')))
    self.assertEqual(
        "JSON_VALUE('{}', '$.x', INT, DEFAULT, 42, ERROR, null)",
        str(
            lit('{}').json_value('$.x', DataTypes.INT(),
                                 JsonValueOnEmptyOrError.DEFAULT, 42,
                                 JsonValueOnEmptyOrError.ERROR, None)))
    self.assertEqual(
        "JSON_QUERY('{}', '$.x', WITHOUT_ARRAY, NULL, EMPTY_ARRAY)",
        str(
            lit('{}').json_query('$.x', JsonQueryWrapper.WITHOUT_ARRAY,
                                 JsonQueryOnEmptyOrError.NULL,
                                 JsonQueryOnEmptyOrError.EMPTY_ARRAY)))

def log_processing():
    env_settings = EnvironmentSettings.new_instance().in_streaming_mode() \
        .use_blink_planner().build()
    t_env = StreamTableEnvironment.create(environment_settings=env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///Users/liuhongwei/.m2/repository/org/apache/flink/flink-connector-kafka_2.11/1.12.0/flink-connector-kafka_2.11-1.12.0.jar;"
        "file:///Users/liuhongwei/.m2/repository/net/sf/json-lib/json-lib/2.3/json-lib-2.3-jdk15.jar;"
        "file:///Users/liuhongwei/.m2/repository/org/apache/kafka/kafka-clients/2.4.1/kafka-clients-2.4.1.jar")

    source_ddl = """
        CREATE TABLE source_table(
            token VARCHAR,
            stime BIGINT,
            appKey VARCHAR,
            user_action_time AS PROCTIME()
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'markTopic',
            'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
            'properties.group.id' = 'test_3',
            'scan.startup.mode' = 'earliest-offset',
            'format' = 'json'
        )
    """

    sink_ddl = """
        CREATE TABLE sink_table(
            token VARCHAR,
            appKey VARCHAR,
            stime TIMESTAMP(3) NOT NULL,
            nums BIGINT NOT NULL
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'markTopic1',
            'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
            'format' = 'json'
        )
    """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    query_sql = """
        SELECT token, appKey,
               TUMBLE_START(user_action_time, INTERVAL '5' MINUTE) as stime,
               COUNT(token) as nums
        FROM source_table
        WHERE appKey = 'YSHAppAndroidIOSH5'
        GROUP BY token, appKey, TUMBLE(user_action_time, INTERVAL '5' MINUTE)
    """
    # t_env.sql_query(query_sql) \
    #     .execute_insert("sink_table").wait()

    source_t = t_env.from_path("source_table")
    result = source_t.filter(source_t.appKey == "YSHAppAndroidIOSH5") \
        .window(Slide.over(lit(1).days)
                .every(lit(1).minutes)
                .on(source_t.user_action_time).alias("w")) \
        .group_by(source_t.token, source_t.appKey, col("w")) \
        .select(source_t.token,
                source_t.appKey,
                col("w").start.alias("stime"),
                source_t.token.count.alias("nums"))
    result.execute_insert("sink_table").wait()

def test_sliding_group_window_over_time(self):
    # create source file path
    import tempfile
    import os
    tmp_dir = tempfile.gettempdir()
    data = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = tmp_dir + '/test_sliding_group_window_over_time.csv'
    with open(source_path, 'w') as fd:
        for ele in data:
            fd.write(ele + '\n')

    from pyflink.table.window import Slide
    self.t_env.get_config().set(
        "pipeline.time-characteristic", "EventTime")
    self.t_env.register_function("mean_udaf", mean_udaf)

    source_table = """
        create table source_table(
            a TINYINT,
            b SMALLINT,
            c SMALLINT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) with(
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '%s',
            'format.ignore-first-line' = 'false',
            'format.field-delimiter' = ','
        )
    """ % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")
    sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d FLOAT)
        WITH ('connector'='test-sink')
    """
    self.t_env.execute_sql(sink_table_ddl)
    t.window(Slide.over(lit(1).hours)
             .every(lit(30).minutes)
             .on(col("rowtime"))
             .alias("w")) \
        .group_by(t.a, t.b, col("w")) \
        .select(t.a, col("w").start, col("w").end, mean_udaf(t.c).alias("b")) \
        .execute_insert("Results") \
        .wait()
    actual = source_sink_utils.results()
    self.assert_equals(actual,
                       ["+I[1, 2018-03-11T02:30, 2018-03-11T03:30, 2.0]",
                        "+I[1, 2018-03-11T03:00, 2018-03-11T04:00, 2.5]",
                        "+I[1, 2018-03-11T03:30, 2018-03-11T04:30, 5.5]",
                        "+I[1, 2018-03-11T04:00, 2018-03-11T05:00, 8.0]",
                        "+I[2, 2018-03-11T02:30, 2018-03-11T03:30, 1.0]",
                        "+I[2, 2018-03-11T03:00, 2018-03-11T04:00, 2.0]",
                        "+I[2, 2018-03-11T03:30, 2018-03-11T04:30, 3.0]",
                        "+I[3, 2018-03-11T03:00, 2018-03-11T04:00, 2.0]",
                        "+I[3, 2018-03-11T02:30, 2018-03-11T03:30, 2.0]"])
    os.remove(source_path)

import os
import shutil
import tempfile

sink_path = tempfile.gettempdir() + '/batch.csv'
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)

s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c'])

st_env.create_temporary_table(
    "csv_sink",
    TableDescriptor.for_connector("filesystem")
                   .schema(Schema.new_builder()
                           .column("a", DataTypes.BIGINT())
                           .column("b", DataTypes.STRING())
                           .column("c", DataTypes.STRING())
                           .build())
                   .option("path", sink_path)
                   .format(FormatDescriptor.for_format("csv")
                           .option("field-delimiter", ",")
                           .build())
                   .build())

t.select(t.a + lit(1), t.b, t.c).execute_insert("csv_sink").wait()

with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')