def test_from_and_to_data_stream_event_time(self):
    # End-to-end event-time round trip: DataStream -> Table (rowtime metadata
    # column + source watermark) -> SQL windowed aggregation, then
    # Table -> DataStream -> keyed event-time window on the same data.
    from pyflink.table import Schema
    ds = self.env.from_collection([(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
                                  Types.ROW_NAMED(
                                      ["a", "b", "c"],
                                      [Types.LONG(), Types.INT(), Types.STRING()]))
    # Presumably column 'a' carries the event timestamp extracted by
    # MyTimestampAssigner — defined elsewhere in this file; confirm there.
    ds = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_monotonous_timestamps()
        .with_timestamp_assigner(MyTimestampAssigner()))
    # Expose the stream's record timestamp as a 'rowtime' metadata column and
    # reuse the DataStream watermarks via SOURCE_WATERMARK().
    table = self.t_env.from_data_stream(ds,
                                        Schema.new_builder()
                                        .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                        .watermark("rowtime", "SOURCE_WATERMARK()")
                                        .build())
    self.assertEqual("""(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
                     table._j_table.getResolvedSchema().toString())
    self.t_env.create_temporary_view("t",
                                     ds,
                                     Schema.new_builder()
                                     .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                     .watermark("rowtime", "SOURCE_WATERMARK()")
                                     .build())
    # 5ms tumbling windows grouped by c over the rowtime column.
    result = self.t_env.execute_sql("SELECT "
                                    "c, SUM(b) "
                                    "FROM t "
                                    "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
    with result.collect() as result:
        collected_result = [str(item) for item in result]
        expected_result = [item for item in
                           map(str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])]
        expected_result.sort()
        collected_result.sort()
        self.assertEqual(expected_result, collected_result)
    # NOTE(review): relies on to_data_stream propagating the rowtime column as
    # the stream record timestamp so the event-time window below fires — confirm.
    ds = self.t_env.to_data_stream(table)
    ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
        .window(MyTumblingEventTimeWindow()) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)
    self.env.execute()
    expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
    actual_results = self.test_sink.get_results(False)
    expected_results.sort()
    actual_results.sort()
    self.assertEqual(expected_results, actual_results)
def test_table_from_descriptor(self):
    # A table built from a descriptor must keep the declared schema and
    # expose the connector option on its resolved catalog table.
    from pyflink.table.schema import Schema

    expected_schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
    fake_descriptor = TableDescriptor.for_connector("fake").schema(expected_schema).build()

    table = self.t_env.from_descriptor(fake_descriptor)

    resolved = table._j_table.getResolvedSchema()
    round_tripped = Schema(
        Schema.new_builder()._j_builder.fromResolvedSchema(resolved).build())
    self.assertEqual(expected_schema, round_tripped)

    resolved_table = table._j_table.getQueryOperation().getContextResolvedTable()
    options = resolved_table.getTable().getOptions()
    self.assertEqual("fake", options.get("connector"))
def test_from_and_to_changelog_stream_event_time(self):
    # Round-trips an event-time stream: DataStream -> changelog table ->
    # SQL column reordering -> changelog stream -> keyed event-time window.
    from pyflink.table import Schema
    self.env.set_parallelism(1)
    ds = self.env.from_collection(
        [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
        Types.ROW([Types.LONG(), Types.INT(), Types.STRING()]))
    # Presumably f0 carries the event timestamp extracted by
    # MyTimestampAssigner — defined elsewhere in this file; confirm there.
    ds = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_monotonous_timestamps(
        ).with_timestamp_assigner(MyTimestampAssigner()))
    # Drop the timestamp field; it is re-introduced below as rowtime metadata.
    changelog_stream = ds.map(lambda t: Row(t.f1, t.f2),
                              Types.ROW([Types.INT(), Types.STRING()]))

    # derive physical columns and add a rowtime
    table = self.t_env.from_changelog_stream(
        changelog_stream,
        Schema.new_builder().column_by_metadata(
            "rowtime", DataTypes.TIMESTAMP_LTZ(3)).column_by_expression(
            "computed", str(col("f1").upper_case)).watermark(
            "rowtime", str(source_watermark())).build())
    self.t_env.create_temporary_view("t", table)

    # access and reorder columns
    reordered = self.t_env.sql_query("SELECT computed, rowtime, f0 FROM t")

    # write out the rowtime column with fully declared schema
    result = self.t_env.to_changelog_stream(
        reordered,
        Schema.new_builder().column(
            "f1", DataTypes.STRING()).column_by_metadata(
            "rowtime", DataTypes.TIMESTAMP_LTZ(3)).column_by_expression(
            "ignored", str(col("f1").upper_case)).column(
            "f0", DataTypes.INT()).build())

    # test event time window and field access
    result.key_by(lambda k: k.f1) \
        .window(MyTumblingEventTimeWindow()) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)
    self.env.execute()
    # Keys are upper-cased by the 'computed' expression, hence 'A'/'C'.
    expected_results = ['(A,47)', '(C,1000)', '(C,1000)']
    actual_results = self.test_sink.get_results(False)
    expected_results.sort()
    actual_results.sort()
    self.assertEqual(expected_results, actual_results)
def test_from_data_stream_with_schema(self):
    # from_data_stream with an explicit Schema must preserve all rows.
    from pyflink.table import Schema

    input_rows = [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')]
    ds = self.env.from_collection(
        input_rows,
        type_info=Types.ROW_NAMED(["a", "b", "c"],
                                  [Types.INT(), Types.STRING(), Types.STRING()]))

    declared_schema = (Schema.new_builder()
                       .column("a", DataTypes.INT())
                       .column("b", DataTypes.STRING())
                       .column("c", DataTypes.STRING())
                       .build())
    table = self.t_env.from_data_stream(ds, declared_schema)

    with table.execute().collect() as results:
        collected_result = sorted(str(row) for row in results)
    expected_result = sorted(map(str, [Row(1, 'Hi', 'Hello'), Row(2, 'Hello', 'Hi')]))
    self.assertEqual(expected_result, collected_result)
def test_to_string(self):
    # Verifies the human-readable rendering of format and table descriptors.
    schema = Schema.new_builder().column("f0", DataTypes.STRING()).build()

    format_descriptor = (FormatDescriptor
                         .for_format("test-format")
                         .option(self.option_a, False)
                         .build())
    table_descriptor = (TableDescriptor
                        .for_connector("test-connector")
                        .schema(schema)
                        .partitioned_by("f0")
                        .option(self.option_a, True)
                        .format(format_descriptor)
                        .comment("Test Comment")
                        .build())

    self.assertEqual("test-format[{a=false}]", str(format_descriptor))
    self.assertEqual(
        """(
  `f0` STRING
)
COMMENT 'Test Comment'
PARTITIONED BY (`f0`)
WITH (
  'a' = 'true',
  'connector' = 'test-connector',
  'test-format.a' = 'false',
  'format' = 'test-format'
)""",
        str(table_descriptor))
def test_create_temporary_table_from_descriptor(self):
    # A temporary table must be visible to the catalog manager but must not
    # appear as a permanent table in the current catalog.
    from pyflink.table.schema import Schema

    t_env = self.t_env
    current_catalog = t_env.get_current_catalog()
    current_database = t_env.get_current_database()

    expected_schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
    t_env.create_temporary_table(
        "T",
        TableDescriptor.for_connector("fake")
                       .schema(expected_schema)
                       .option("a", "Test")
                       .build())

    # Not registered as a permanent table.
    self.assertFalse(
        t_env.get_catalog(current_catalog).table_exists(
            ObjectPath(current_database, "T")))

    gateway = get_gateway()
    identifier = gateway.jvm.ObjectIdentifier.of(current_catalog, current_database, "T")
    catalog_table = CatalogBaseTable(
        t_env._j_tenv.getCatalogManager().getTable(identifier).get().getTable())

    self.assertEqual(expected_schema, catalog_table.get_unresolved_schema())
    options = catalog_table.get_options()
    self.assertEqual("fake", options.get("connector"))
    self.assertEqual("Test", options.get("a"))
def test_stream_case(self):
    # Exercises the interactive-shell entry points end to end by writing a
    # small table to a filesystem CSV sink and checking the file contents.
    from pyflink.shell import s_env, st_env, DataTypes
    from pyflink.table.schema import Schema
    from pyflink.table.table_descriptor import TableDescriptor, FormatDescriptor
    # example begin
    import tempfile
    import os
    import shutil

    sink_path = tempfile.gettempdir() + '/streaming.csv'
    # Remove leftovers from a previous run; the sink may be a file or a dir.
    if os.path.exists(sink_path):
        if os.path.isfile(sink_path):
            os.remove(sink_path)
        else:
            shutil.rmtree(sink_path)

    s_env.set_parallelism(1)
    t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c'])

    sink_schema = (Schema.new_builder()
                   .column("a", DataTypes.BIGINT())
                   .column("b", DataTypes.STRING())
                   .column("c", DataTypes.STRING())
                   .build())
    csv_format = (FormatDescriptor.for_format("csv")
                  .option("field-delimiter", ",")
                  .build())
    st_env.create_temporary_table(
        "stream_sink",
        TableDescriptor.for_connector("filesystem")
                       .schema(sink_schema)
                       .option("path", sink_path)
                       .format(csv_format)
                       .build())

    t.select(t.a + 1, t.b, t.c).execute_insert("stream_sink").wait()
    # verify code, do not copy these code to shell.py
    with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
        lines = f.read()
        self.assertEqual(lines, '2,hi,hello\n' + '3,hi,hello\n')
def test_table_from_descriptor(self):
    # Same contract as the context-resolved variant: the declared schema and
    # connector option must survive from_descriptor, here checked via the
    # catalog manager lookup by table identifier.
    from pyflink.table.schema import Schema

    expected_schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
    descriptor = TableDescriptor.for_connector("fake").schema(expected_schema).build()

    table = self.t_env.from_descriptor(descriptor)

    round_tripped = Schema(
        Schema.new_builder()._j_builder
        .fromResolvedSchema(table._j_table.getResolvedSchema())
        .build())
    self.assertEqual(expected_schema, round_tripped)

    identifier = table._j_table.getQueryOperation().getTableIdentifier()
    catalog_table = CatalogBaseTable(
        self.t_env._j_tenv.getCatalogManager().getTable(identifier).get().getTable())
    self.assertEqual("fake", catalog_table.get_options().get("connector"))
def test_format_basic(self):
    # Passing a plain format name should add exactly one 'format' option.
    descriptor = (TableDescriptor.for_connector("test-connector")
                  .schema(Schema.new_builder().build())
                  .format("json")
                  .build())

    options = descriptor.get_options()
    self.assertEqual(2, len(options))
    self.assertEqual("test-connector", options.get("connector"))
    self.assertEqual("json", options.get("format"))
def test_execute_insert_to_table_descriptor(self):
    # Inserting into an anonymous sink described only by a TableDescriptor.
    shared_schema = (Schema.new_builder()
                     .column("f0", DataTypes.STRING())
                     .build())

    source = self.t_env.from_descriptor(
        TableDescriptor.for_connector("datagen")
                       .option("number-of-rows", '10')
                       .schema(shared_schema)
                       .build())

    sink = TableDescriptor.for_connector("blackhole").schema(shared_schema).build()
    source.execute_insert(sink).collect()
def test_options(self):
    # Typed option values (bool/int) must be stringified when stored.
    descriptor = (TableDescriptor.for_connector("test-connector")
                  .schema(Schema.new_builder().build())
                  .option(self.option_a, False)
                  .option(self.option_b, 42)
                  .option("c", "C")
                  .build())

    options = descriptor.get_options()
    self.assertEqual(4, len(options))
    self.assertEqual("test-connector", options.get("connector"))
    self.assertEqual("false", options.get("a"))
    self.assertEqual("42", options.get("b"))
    self.assertEqual("C", options.get("c"))
def test_schema_basic(self):
    # Exercises every Schema builder facility: copying from a row type and
    # from parallel field lists, physical / metadata / computed columns,
    # a watermark and a named primary key. The builder call order below is
    # significant — it determines the column order in the rendered schema.
    old_schema = (Schema.new_builder()
                  .from_row_data_type(DataTypes.ROW(
                      [DataTypes.FIELD("a", DataTypes.TINYINT()),
                       DataTypes.FIELD("b", DataTypes.SMALLINT()),
                       DataTypes.FIELD("c", DataTypes.INT())]))
                  .from_fields(["d", "e"], [DataTypes.STRING(), DataTypes.BOOLEAN()])
                  .build())
    self.schema = (Schema.new_builder()
                   .from_schema(old_schema)
                   .primary_key_named("primary_constraint", "id")
                   .column("id", DataTypes.INT().not_null())
                   .column("counter", DataTypes.INT().not_null())
                   .column("payload", "ROW<name STRING, age INT, flag BOOLEAN>")
                   .column_by_metadata("topic", DataTypes.STRING(), None, True)
                   .column_by_expression("ts", call_sql("orig_ts - INTERVAL '60' MINUTE"))
                   .column_by_metadata("orig_ts", DataTypes.TIMESTAMP(3), "timestamp")
                   .watermark("ts", "ts - INTERVAL '5' SECOND")
                   .column_by_expression("proctime", "PROCTIME()")
                   .build())
    self.assertEqual(
        """(
  `a` TINYINT,
  `b` SMALLINT,
  `c` INT,
  `d` STRING,
  `e` BOOLEAN,
  `id` INT NOT NULL,
  `counter` INT NOT NULL,
  `payload` [ROW<name STRING, age INT, flag BOOLEAN>],
  `topic` METADATA VIRTUAL,
  `ts` AS [orig_ts - INTERVAL '60' MINUTE],
  `orig_ts` METADATA FROM 'timestamp',
  `proctime` AS [PROCTIME()],
  WATERMARK FOR `ts` AS [ts - INTERVAL '5' SECOND],
  CONSTRAINT `primary_constraint` PRIMARY KEY (`id`) NOT ENFORCED
)""",
        str(self.schema))
def test_create_table_from_descriptor(self):
    # A permanent table created from a descriptor must exist in the current
    # catalog with its declared schema and options.
    from pyflink.table.schema import Schema

    current_catalog = self.t_env.get_current_catalog()
    current_database = self.t_env.get_current_database()

    expected_schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
    self.t_env.create_table(
        "T",
        TableDescriptor.for_connector("fake")
                       .schema(expected_schema)
                       .option("a", "Test")
                       .build())

    table_path = ObjectPath(current_database, "T")
    catalog = self.t_env.get_catalog(current_catalog)
    self.assertTrue(catalog.table_exists(table_path))

    catalog_table = catalog.get_table(table_path)
    self.assertEqual(expected_schema, catalog_table.get_unresolved_schema())
    self.assertEqual("fake", catalog_table.get_options().get("connector"))
    self.assertEqual("Test", catalog_table.get_options().get("a"))
def test_statement_set_insert_using_table_descriptor(self):
    # A statement set must accept a TableDescriptor as an insert target.
    shared_schema = (Schema.new_builder()
                     .column("f0", DataTypes.INT())
                     .build())

    source_descriptor = (TableDescriptor.for_connector("datagen")
                         .schema(shared_schema)
                         .option("number-of-rows", '10')
                         .build())
    sink_descriptor = (TableDescriptor.for_connector("blackhole")
                       .schema(shared_schema)
                       .build())

    self.t_env.create_temporary_table("T", source_descriptor)

    stmt_set = self.t_env.create_statement_set()
    stmt_set.add_insert(sink_descriptor, self.t_env.from_path("T"))
    stmt_set.execute().wait()
def test_format_with_format_descriptor(self):
    # Format options must be re-prefixed with the key format's option prefix.
    test_format = (FormatDescriptor.for_format("test-format")
                   .option(self.option_a, True)
                   .option(self.option_b, 42)
                   .option("c", "C")
                   .build())
    descriptor = (TableDescriptor.for_connector("test-connector")
                  .schema(Schema.new_builder().build())
                  .format(test_format, self.key_format)
                  .build())

    options = descriptor.get_options()
    self.assertEqual(5, len(options))
    self.assertEqual("test-connector", options.get("connector"))
    self.assertEqual("test-format", options.get("key.format"))
    self.assertEqual("true", options.get("key.test-format.a"))
    self.assertEqual("42", options.get("key.test-format.b"))
    self.assertEqual("C", options.get("key.test-format.c"))
def test_basic(self):
    # Schema, partition keys, connector option and comment must all be
    # retrievable from a built TableDescriptor.
    schema = (Schema.new_builder()
              .column("f0", DataTypes.STRING())
              .column("f1", DataTypes.BIGINT())
              .primary_key("f0")
              .build())
    descriptor = (TableDescriptor.for_connector("test-connector")
                  .schema(schema)
                  .partitioned_by("f0")
                  .comment("Test Comment")
                  .build())

    self.assertIsNotNone(descriptor.get_schema())

    partition_keys = descriptor.get_partition_keys()
    self.assertEqual(1, len(partition_keys))
    self.assertEqual("f0", partition_keys[0])

    options = descriptor.get_options()
    self.assertEqual(1, len(options))
    self.assertEqual("test-connector", options.get("connector"))

    self.assertEqual("Test Comment", descriptor.get_comment())
import os
import shutil

sink_path = tempfile.gettempdir() + '/batch.csv'
# Remove leftovers from a previous run; the sink may be a file or a directory.
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)

s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c'])

sink_schema = (Schema.new_builder()
               .column("a", DataTypes.BIGINT())
               .column("b", DataTypes.STRING())
               .column("c", DataTypes.STRING())
               .build())
csv_format = (FormatDescriptor.for_format("csv")
              .option("field-delimiter", ",")
              .build())
st_env.create_temporary_table(
    "csv_sink",
    TableDescriptor.for_connector("filesystem")
                   .schema(sink_schema)
                   .option("path", sink_path)
                   .format(csv_format)
                   .build())

t.select(t.a + lit(1), t.b, t.c).execute_insert("csv_sink").wait()

# Check the sink file contents produced by the insert above.
with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')
import os
import shutil

sink_path = tempfile.gettempdir() + '/batch.csv'
# Remove leftovers from a previous run; the sink may be a file or a directory.
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)

s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c'])
st_env.create_temporary_table("csv_sink",
                              TableDescriptor.for_connector("filesystem")
                              .schema(Schema.new_builder()
                                      .column("a", DataTypes.BIGINT())
                                      .column("b", DataTypes.STRING())
                                      .column("c", DataTypes.STRING())
                                      .build())
                              .option("path", sink_path)
                              .format(FormatDescriptor.for_format("csv")
                                      .option("field-delimiter", ",")
                                      .build())
                              .build())

# Use the expression DSL (as the sibling snippets do): the string-based
# select("a + 1, b, c") form was deprecated in Flink 1.12 and removed in 1.16.
t.select(t.a + 1, t.b, t.c).execute_insert("csv_sink").wait()

# Check the sink file contents produced by the insert above.
with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')