def test_register_table_source(self):
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env = self.t_env

    sink_path = os.path.join(self.tempdir, 'streaming2.csv')
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    t_env.register_table_sink(
        "sink",
        field_names, field_types, CsvTableSink(sink_path))

    # register the CSV file as table "source" via the connector descriptor API
    t_env.connect(FileSystem().path(source_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING())) \
        .with_schema(Schema()
                     .field("a", DataTypes.INT())
                     .field("b", DataTypes.STRING())
                     .field("c", DataTypes.STRING())) \
        .register_table_source("source")

    t_env.scan("source") \
        .select("a + 1, b, c") \
        .insert_into("sink")
    t_env.execute()

    with open(sink_path, 'r') as f:
        lines = f.read()
        assert lines == '2,Hi,Hello\n' + '3,Hello,Hello\n'
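# The tests in this listing call self.prepare_csv_source, which is not shown in
# this excerpt. Below is a minimal sketch of such a helper, assuming it simply
# deletes any stale file, writes the rows to `path` as comma-separated lines and
# returns a CsvTableSource over them; the actual helper in the test base class
# may differ.
def prepare_csv_source(self, path, data, field_types, field_names):
    if os.path.isfile(path):
        os.remove(path)
    with open(path, 'w') as f:
        for row in data:
            f.write(','.join(str(col) for col in row) + '\n')
    return CsvTableSource(path, field_names, field_types)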
def test_list_tables(self):
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
    data = []
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env = self.t_env
    t_env.register_table_source("Orders", csv_source)

    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    t_env.register_table_sink("Sinks", field_names, field_types, CsvTableSink(tmp_csv))
    t_env.register_table_sink("Results", field_names, field_types, CsvTableSink(tmp_csv))

    actual = t_env.list_tables()
    expected = ['Orders', 'Results', 'Sinks']
    self.assert_equals(actual, expected)
def word_count():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"
    with open(source_path, 'w') as f:
        for word in content.split(" "):
            f.write(",".join([word, "1"]))
            f.write("\n")
        f.flush()

    t_config = TableConfig.Builder().as_batch_execution().set_parallelism(1).build()
    t_env = TableEnvironment.create(t_config)

    field_names = ["word", "count"]
    field_types = [DataTypes.STRING, DataTypes.LONG]

    # register the Word table as source in the table environment
    t_env.register_table_source(
        "Word",
        CsvTableSource(source_path, field_names, field_types))

    # register the Results table as sink in the table environment
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink("Results", field_names, field_types, CsvTableSink(tmp_csv))

    t_env.scan("Word") \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")
    t_env.execute()
def test_end_to_end():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    with open(source_path, 'w') as f:
        lines = '1,hi,hello\n' + '2,hi,hello\n'
        f.write(lines)

    _find_flink_home()
    print("using %s as FLINK_HOME..." % os.environ["FLINK_HOME"])

    t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
    t_env = TableEnvironment.get_table_environment(t_config)

    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

    # register the Orders table as source in the table environment
    t_env.register_table_source(
        "Orders",
        CsvTableSource(source_path, field_names, field_types))

    # register the Results table as sink in the table environment
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink(
        "Results", field_names, field_types, CsvTableSink(tmp_csv))

    t_env.scan("Orders") \
        .where("a > 0") \
        .select("a + 1, b, c") \
        .insert_into("Results")
    t_env.execute()

    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '2,hi,hello\n' + '3,hi,hello\n'
    print("test passed, the log file is under this directory: %s/log" % os.environ["FLINK_HOME"])
def test_sql_update(self):
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env = self.t_env
    t_env.register_table_source("source", csv_source)

    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink("sinks", field_names, field_types, CsvTableSink(tmp_csv))

    t_env.sql_update("insert into sinks select * from source")
    t_env.execute("test_sql_job")

    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '1,Hi,Hello\n' + '2,Hello,Hello\n'
def test_register_table_source_sink(self):
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
    data = [(1, "Hi", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env = self.t_env

    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_source("Orders", csv_source)
    t_env.register_table_sink("Results", field_names, field_types, CsvTableSink(tmp_csv))

    t_env.scan("Orders").insert_into("Results")
    t_env.execute()

    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '1,Hi,Hello\n'
def test_sql_query(self):
    source_path = os.path.join(self.tempdir, 'streaming.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
    data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
    t_env = self.t_env
    t_env.register_table_source("Source", csv_source)
    source = t_env.scan("Source")

    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink("sinks", field_names, field_types, CsvTableSink(tmp_csv))

    # interpolating the Table object into the SQL string substitutes the name
    # under which the table is known to the environment
    result = t_env.sql_query("select a + 1, b, c from %s" % source)
    result.insert_into("sinks")
    t_env.execute()

    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '2,Hi,Hello\n' + '3,Hello,Hello\n'