def table_func_python_sql_join_lateral_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_table = bt_env.from_elements(
        [("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")],
        ["a", "b"]).select("a, b")
    result_file = "/tmp/table_func_python_sql_join_lateral_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.STRING(), DataTypes.STRING(), DataTypes.INT()],
                     result_file))
    bt_env.register_java_function("split", "com.pyflink.table.Split")
    bt_env.register_table("MyTable", source_table)
    result = bt_env.sql_query(
        "SELECT a, word, length FROM MyTable, LATERAL TABLE(split(a)) as T(word, length)")
    result.insert_into("result")
    bt_env.execute("table func python sql join lateral api")
def minus_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = "/tmp/table_minus_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements(
        [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (1, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements(
        [(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file))
    result = left.minus(right)
    result.insert_into("result")
    bt_env.execute("minus batch")
def select_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "c"], [DataTypes.STRING(), DataTypes.INT()],
                     result_file))
    orders = st_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    st_env.execute("select streaming")
def input_output_table():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    statement_set = table_env.create_statement_set()
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/input_output.py"
    prop = {}
    func = "map_func"
    env_path = None
    prop[MLCONSTANTS.ENCODING_CLASS] = \
        "org.flinkextended.flink.ml.operator.coding.RowCSVCoding"
    prop[MLCONSTANTS.DECODING_CLASS] = \
        "org.flinkextended.flink.ml.operator.coding.RowCSVCoding"
    inputSb = "INT_32" + "," + "INT_64" + "," + "FLOAT_32" + "," + "FLOAT_64" + "," + "STRING"
    prop["sys:csv_encode_types"] = inputSb
    prop["sys:csv_decode_types"] = inputSb
    prop[MLCONSTANTS.PYTHON_VERSION] = "3.7"
    source_file = os.getcwd() + "/../../src/test/resources/input.csv"
    sink_file = os.getcwd() + "/../../src/test/resources/output.csv"
    table_source = CsvTableSource(source_file, ["a", "b", "c", "d", "e"], [
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING()
    ])
    table_env.register_table_source("source", table_source)
    input_tb = table_env.from_path("source")
    output_schema = TableSchema(["a", "b", "c", "d", "e"], [
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING()
    ])
    sink = CsvTableSink(["a", "b", "c", "d", "e"], [
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING()
    ], sink_file, write_mode=WriteMode.OVERWRITE)
    table_env.register_table_sink("table_row_sink", sink)
    tf_config = TFConfig(work_num, ps_num, prop, python_file, func, env_path)
    output_table = train(stream_env, table_env, statement_set, input_tb,
                         tf_config, output_schema)
    # output_table = inference(stream_env, table_env, statement_set, input_tb,
    #                          tf_config, output_schema)
    statement_set.add_insert("table_row_sink", output_table)
    job_client = statement_set.execute().get_job_client()
    if job_client is not None:
        job_client.get_job_execution_result(user_class_loader=None).result()
def test_table_environment_with_blink_planner(self):
    self.env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance()
            .use_blink_planner().build())
    source_path = os.path.join(self.tempdir + '/streaming.csv')
    sink_path = os.path.join(self.tempdir + '/result.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env.register_table_source("source", csv_source)
    t_env.register_table_sink(
        "sink",
        CsvTableSink(field_names, field_types, sink_path))
    source = t_env.scan("source")
    result = source.alias("a, b, c").select("1 + a, b, c")
    result.insert_into("sink")
    t_env.execute("blink_test")
    results = []
    with open(sink_path, 'r') as f:
        results.append(f.readline())
        results.append(f.readline())
    self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
def group_by_window_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_window_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "start", "end", "rowtime", "d"], [
            DataTypes.STRING(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.INT()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("a, w") \
        .select("a, w.start, w.end, w.rowtime, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg batch")
def test_table_environment_with_blink_planner(self):
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
            .in_batch_mode().use_blink_planner().build())
    source_path = os.path.join(self.tempdir + '/streaming.csv')
    sink_path = os.path.join(self.tempdir + '/results')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env.register_table_source("source", csv_source)
    t_env.register_table_sink(
        "sink",
        CsvTableSink(field_names, field_types, sink_path))
    source = t_env.scan("source")
    result = source.alias("a, b, c").select("1 + a, b, c")
    result.insert_into("sink")
    t_env.execute("blink_test")
    results = []
    for root, dirs, files in os.walk(sink_path):
        for sub_file in files:
            with open(os.path.join(root, sub_file), 'r') as f:
                line = f.readline()
                while line is not None and line != '':
                    results.append(line)
                    line = f.readline()
    self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
def filter_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
            .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_filter_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ], result_file))
    orders = st_env.scan("Orders")
    result = orders.filter("b % 2 === 0")
    result.insert_into("result")
    st_env.execute("filter streaming")
def union():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = os.getcwd() + "/tmp/table_union_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements(
        [(1, "1b", "1bb"), (2, "2a", "2aa"), (3, None, "3aa"), (1, "1a", "1laa"), (1, "1b", "1bb")],
        ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements(
        [(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")],
        ["a", "b", "c"]).select("a, b, c")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file))
    result = left.union(right)
    # result = left.union_all(right)
    result.insert_into("result")
    bt_env.execute("union")
    with open(result_file, 'r') as f:
        print(f.read())
def select():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "c"], [DataTypes.STRING(), DataTypes.INT()],
                     result_file))
    orders = bt_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    bt_env.execute("select")
    with open(result_file, 'r') as f:
        print(f.read())
def test_execute(self):
    tmp_dir = tempfile.gettempdir()
    field_names = ['a', 'b', 'c']
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env = StreamTableEnvironment.create(self.env)
    t_env.register_table_sink(
        'Results',
        CsvTableSink(field_names, field_types,
                     os.path.join('{}/{}.csv'.format(tmp_dir, round(time.time())))))
    t_env.insert_into(
        'Results',
        t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c']))
    execution_result = t_env.execute('test_stream_execute')
    self.assertIsNotNone(execution_result.get_job_id())
    self.assertTrue(execution_result.is_job_execution_result())
    self.assertIsNotNone(
        execution_result.get_job_execution_result().get_job_id())
    self.assertIsNotNone(execution_result.get_net_runtime())
    self.assertEqual(len(execution_result.get_all_accumulator_results()), 0)
    self.assertIsNone(execution_result.get_accumulator_result('accumulator'))
    self.assertIsNotNone(execution_result.to_string())
def add_columns_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_add_columns_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime", "d"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP(),
            DataTypes.STRING()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.add_columns("concat(a, '_sunny') as d")
    result.insert_into("result")
    bt_env.execute("add columns batch")
def left_outer_join_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = "/tmp/table_left_outer_join_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements(
        [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")],
        ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements(
        [(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")],
        ["d", "e", "f"]).select("d, e, f")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file))
    result = left.left_outer_join(right, "a = d").select("a, b, e")
    result.insert_into("result")
    bt_env.execute("left outer join batch")
def inner_join_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/table_inner_join_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = st_env.from_elements(
        [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")],
        ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements(
        [(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")],
        ["d", "e", "f"]).select("d, e, f")
    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file))
    result = left.join(right).where("a = d").select("a, b, e")
    result.insert_into("result")
    st_env.execute("inner join streaming")
def where_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = os.getcwd() + "/../result/table_where_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.where("a === 'b'")
    result.insert_into("result")
    bt_env.execute("where batch")
def offset_and_fetch_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file_1 = "/tmp/table_offset_and_fetch_batch_1.csv"
    result_file_2 = "/tmp/table_offset_and_fetch_batch_2.csv"
    result_file_3 = "/tmp/table_offset_and_fetch_batch_3.csv"
    if os.path.exists(result_file_1):
        os.remove(result_file_1)
    if os.path.exists(result_file_2):
        os.remove(result_file_2)
    if os.path.exists(result_file_3):
        os.remove(result_file_3)
    bt_env.register_table_sink(
        "result1",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
                     result_file_1))
    bt_env.register_table_sink(
        "result2",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
                     result_file_2))
    bt_env.register_table_sink(
        "result3",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
                     result_file_3))
    left = bt_env.from_elements(
        [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")
    ordered_table = left.order_by("a.asc")
    ordered_table.fetch(5).insert_into("result1")
    ordered_table.offset(1).insert_into("result2")
    ordered_table.offset(1).fetch(2).insert_into("result3")
    bt_env.execute("offset and fetch batch")
def test_explain_with_multi_sinks(self):
    t_env = self.t_env
    source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")],
                                 ["a", "b", "c"])
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink(
        "sink1",
        CsvTableSink(field_names, field_types, "path1"))
    t_env.register_table_sink(
        "sink2",
        CsvTableSink(field_names, field_types, "path2"))
    t_env.sql_update("insert into sink1 select * from %s where a > 100" % source)
    t_env.sql_update("insert into sink2 select * from %s where a < 100" % source)
    actual = t_env.explain(extended=True)
    self.assertIsInstance(actual, str)
def test_explain_with_multi_sinks(self):
    t_env = self.t_env
    source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")],
                                 ["a", "b", "c"])
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink(
        "sink1",
        CsvTableSink(field_names, field_types, "path1"))
    t_env.register_table_sink(
        "sink2",
        CsvTableSink(field_names, field_types, "path2"))
    stmt_set = t_env.create_statement_set()
    stmt_set.add_insert_sql("insert into sink1 select * from %s where a > 100" % source)
    stmt_set.add_insert_sql("insert into sink2 select * from %s where a < 100" % source)
    actual = stmt_set.explain(ExplainDetail.ESTIMATED_COST, ExplainDetail.CHANGELOG_MODE)
    self.assertIsInstance(actual, str)
def test_statement_set(self):
    t_env = self.t_env
    source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")],
                                 ["a", "b", "c"])
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink(
        "sink1",
        CsvTableSink(field_names, field_types, "path1"))
    t_env.register_table_sink(
        "sink2",
        CsvTableSink(field_names, field_types, "path2"))
    stmt_set = t_env.create_statement_set()
    stmt_set.add_insert_sql("insert into sink1 select * from %s where a > 100" % source) \
        .add_insert("sink2", source.filter("a < 100"))
    actual = stmt_set.explain()
    assert isinstance(actual, str)
def test_explain_with_multi_sinks_with_blink_planner(self):
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
            .in_batch_mode().use_blink_planner().build())
    source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")],
                                 ["a", "b", "c"])
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink(
        "sink1",
        CsvTableSink(field_names, field_types, "path1"))
    t_env.register_table_sink(
        "sink2",
        CsvTableSink(field_names, field_types, "path2"))
    t_env.sql_update("insert into sink1 select * from %s where a > 100" % source)
    t_env.sql_update("insert into sink2 select * from %s where a < 100" % source)
    actual = t_env.explain(extended=True)
    # `unicode` does not exist on Python 3; checking against `str` matches the other tests
    self.assertIsInstance(actual, str)
def execute(self, function_context: FlinkFunctionContext, input_table: Table) -> None:
    t_env = function_context.get_table_env()
    statement_set = function_context.get_statement_set()
    dummy_output_path = function_context.get_example_meta().batch_uri
    if os.path.exists(dummy_output_path):
        if os.path.isdir(dummy_output_path):
            shutil.rmtree(dummy_output_path)
        else:
            os.remove(dummy_output_path)
    sink = CsvTableSink(
        ['a', 'b', 'c'],
        [DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING()],
        dummy_output_path, ';')
    t_env.register_table_sink('mySink', sink)
    statement_set.add_insert("mySink", input_table)
def execute(self, function_context: FlinkFunctionContext, input_table: Table) -> None:
    example_meta: ExampleMeta = function_context.get_example_meta()
    output_file = example_meta.batch_uri
    if os.path.exists(output_file):
        if os.path.isdir(output_file):
            shutil.rmtree(output_file)
        else:
            os.remove(output_file)
    t_env = function_context.get_table_env()
    statement_set = function_context.get_statement_set()
    sink = CsvTableSink(
        ['a', 'b'],
        [DataTypes.STRING(), DataTypes.STRING()],
        output_file, ';')
    t_env.register_table_sink('mySink', sink)
    statement_set.add_insert('mySink', input_table)
def test_get_execution_plan(self):
    tmp_dir = tempfile.gettempdir()
    source_path = os.path.join(tmp_dir + '/streaming.csv')
    tmp_csv = os.path.join(tmp_dir + '/streaming2.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env = BatchTableEnvironment.create(self.env)
    csv_source = CsvTableSource(source_path, field_names, field_types)
    t_env.register_table_source("Orders", csv_source)
    t_env.register_table_sink(
        "Results",
        CsvTableSink(field_names, field_types, tmp_csv))
    t_env.scan("Orders").insert_into("Results")
    plan = self.env.get_execution_plan()
    json.loads(plan)
def custom_test_source_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/custom_test_source_demo.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False)
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
        ).register_table_source("source")
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a"], [DataTypes.STRING()], result_file))
    orders = st_env.scan("source")
    orders.insert_into("result")
    st_env.execute("custom test source demo")
def aggregate_func_python_table_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_table = bt_env.from_elements(
        [("a", 1, 1), ("a", 2, 2), ("b", 3, 2), ("a", 5, 2)],
        ["user", "points", "level"])
    result_file = "/tmp/aggregate_func_python_table_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.BIGINT()],
            result_file))
    bt_env.register_java_function("wAvg", "com.pyflink.table.WeightedAvg")
    result = source_table.group_by("user").select(
        "user, wAvg(points, level) as avgPoints")
    result.insert_into("result")
    bt_env.execute("aggregate func python table api")
def slide_time_window_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_slide_time_window_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Slide.over("60.minutes").every("10.minutes")
                           .on("rowtime").alias("w")) \
        .group_by("w").select("b.sum")
    result.insert_into("result")
    bt_env.execute("slide time window batch")
def distinct_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_distinct_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["b"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.group_by("a") \
        .select("b.sum.distinct as d")
    result.insert_into("result")
    bt_env.execute("distinct agg batch")
st_env \
    .connect(custom_connector) \
    .with_format(custom_format) \
    .with_schema(  # declare the schema of the table
        Schema()
        .field("proctime", DataTypes.TIMESTAMP())
        .proctime()
        .field("a", DataTypes.STRING())
        .field("b", DataTypes.STRING())
        .field("c", DataTypes.STRING())
    ) \
    .in_append_mode() \
    .register_table_source("source")
st_env.register_table_sink(
    "result",
    CsvTableSink(["a", "b"],
                 [DataTypes.STRING(), DataTypes.STRING()],
                 result_file))
st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \
    .group_by("w, a") \
    .select("a, max(b)").insert_into("result")
st_env.execute("custom kafka source demo")
# cat /tmp/custom_kafka_source_demo.csv
# a,3
# b,4
# a,5
def tumble_time_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/tumble_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                " type: 'object',"
                " properties: {"
                " a: {"
                " type: 'string'"
                " },"
                " b: {"
                " type: 'string'"
                " },"
                " c: {"
                " type: 'string'"
                " },"
                " time: {"
                " type: 'string',"
                " format: 'date-time'"
                " }"
                " }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("source")
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b"],
                     [DataTypes.STRING(), DataTypes.STRING()],
                     result_file))
    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")
    st_env.execute("tumble time window streaming")
def custom_kafka_source_demo():
    custom_connector = CustomConnectorDescriptor('kafka', 1, True) \
        .property('connector.topic', 'user') \
        .property('connector.properties.0.key', 'zookeeper.connect') \
        .property('connector.properties.0.value', 'localhost:2181') \
        .property('connector.properties.1.key', 'bootstrap.servers') \
        .property('connector.properties.1.value', 'localhost:9092') \
        .properties({'connector.version': '0.11',
                     'connector.startup-mode': 'earliest-offset'})

    # the key is 'format.json-schema'
    custom_format = CustomFormatDescriptor('json', 1) \
        .property('format.json-schema',
                  "{"
                  " type: 'object',"
                  " properties: {"
                  " a: {"
                  " type: 'string'"
                  " },"
                  " b: {"
                  " type: 'string'"
                  " },"
                  " c: {"
                  " type: 'string'"
                  " },"
                  " time: {"
                  " type: 'string',"
                  " format: 'date-time'"
                  " }"
                  " }"
                  "}") \
        .properties({'format.fail-on-missing-field': 'true'})

    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/custom_kafka_source_demo.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(custom_connector) \
        .with_format(custom_format) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("proctime", DataTypes.TIMESTAMP())
            .proctime()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("source")
    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.STRING()],
            result_file))
    st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")
    st_env.execute("custom kafka source demo")