def test_from_and_to_data_stream_event_time(self):
    """
    Round-trips a DataStream with timestamps and watermarks through the Table API.

    The stream is exposed as a table with a ``rowtime`` metadata column and a
    ``SOURCE_WATERMARK()`` watermark, the resolved schema is compared against its expected
    string form, a ``TUMBLE`` group-by aggregation is run via SQL, and the table is finally
    converted back into a DataStream to run a keyed window aggregation there.
    """
    from pyflink.table import Schema

    ds = self.env.from_collection([(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
                                  Types.ROW_NAMED(
                                      ["a", "b", "c"],
                                      [Types.LONG(), Types.INT(), Types.STRING()]))
    # NOTE(review): MyTimestampAssigner is defined elsewhere; presumably it derives the
    # event timestamp from a row field -- confirm against its definition.
    ds = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_monotonous_timestamps()
        .with_timestamp_assigner(MyTimestampAssigner()))

    # Physical columns are derived from the stream's type information; only the rowtime
    # metadata column and the watermark are declared explicitly.
    table = self.t_env.from_data_stream(ds,
                                        Schema.new_builder()
                                        .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                        .watermark("rowtime", "SOURCE_WATERMARK()")
                                        .build())
    # NOTE(review): the expected literal below sits on a single line in the reviewed text.
    # If the resolved schema is rendered over several lines, the original line breaks and
    # indentation must be restored here -- confirm against the actual toString() output.
    self.assertEqual("""( `a` BIGINT, `b` INT, `c` STRING, `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA, WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK() )""",
                     table._j_table.getResolvedSchema().toString())

    # Register the same stream as view "t" so that it can be queried with SQL.
    self.t_env.create_temporary_view("t",
                                     ds,
                                     Schema.new_builder()
                                     .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                     .watermark("rowtime", "SOURCE_WATERMARK()")
                                     .build())
    result = self.t_env.execute_sql("SELECT "
                                    "c, SUM(b) "
                                    "FROM t "
                                    "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
    # The collected iterator is used as a context manager so it is closed afterwards.
    with result.collect() as result:
        collected_result = [str(item) for item in result]
        expected_result = [item for item in
                           map(str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])]
        # Both sides are sorted so the comparison does not depend on emission order.
        expected_result.sort()
        collected_result.sort()
        self.assertEqual(expected_result, collected_result)

    # Convert the table back into a DataStream and aggregate per key "c" in a window.
    # NOTE(review): MyTumblingEventTimeWindow and SumWindowFunction are defined elsewhere;
    # their exact semantics are not visible here.
    ds = self.t_env.to_data_stream(table)
    ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
        .window(MyTumblingEventTimeWindow()) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)
    self.env.execute()
    expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
    actual_results = self.test_sink.get_results(False)
    expected_results.sort()
    actual_results.sort()
    self.assertEqual(expected_results, actual_results)
def test_table_from_descriptor(self):
    """
    Creates a table from a ``TableDescriptor`` and checks schema and connector option.

    The resolved schema of the created table is converted back into an unresolved
    ``Schema`` and compared with the declared one; the ``connector`` option is read from
    the context-resolved table behind the table's query operation.
    """
    from pyflink.table.schema import Schema

    declared_schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
    fake_table = self.t_env.from_descriptor(
        TableDescriptor.for_connector("fake").schema(declared_schema).build())

    # Rebuild an unresolved schema from the resolved one through the Java builder.
    j_builder = Schema.new_builder()._j_builder
    j_rebuilt_schema = j_builder.fromResolvedSchema(
        fake_table._j_table.getResolvedSchema()).build()
    self.assertEqual(declared_schema, Schema(j_rebuilt_schema))

    j_query_operation = fake_table._j_table.getQueryOperation()
    j_options = j_query_operation.getContextResolvedTable().getTable().getOptions()
    self.assertEqual("fake", j_options.get("connector"))
def test_from_and_to_changelog_stream_event_time(self):
    """
    Round-trips a changelog stream with a rowtime column through the Table API.

    A timestamped stream is mapped to ``Row`` records, turned into a table with a rowtime
    metadata column, a computed column and a watermark, queried with reordered columns,
    written back out as a changelog stream with a fully declared schema and finally
    aggregated per key in a window.
    """
    from pyflink.table import Schema

    self.env.set_parallelism(1)

    source = self.env.from_collection(
        [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
        Types.ROW([Types.LONG(), Types.INT(), Types.STRING()]))
    timestamped = source.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_monotonous_timestamps()
        .with_timestamp_assigner(MyTimestampAssigner()))

    changelog_stream = timestamped.map(
        lambda t: Row(t.f1, t.f2),
        Types.ROW([Types.INT(), Types.STRING()]))

    # derive physical columns and add a rowtime
    table = self.t_env.from_changelog_stream(
        changelog_stream,
        Schema.new_builder()
        .column_by_metadata("rowtime", DataTypes.TIMESTAMP_LTZ(3))
        .column_by_expression("computed", str(col("f1").upper_case))
        .watermark("rowtime", str(source_watermark()))
        .build())

    self.t_env.create_temporary_view("t", table)

    # access and reorder columns
    reordered = self.t_env.sql_query("SELECT computed, rowtime, f0 FROM t")

    # write out the rowtime column with fully declared schema
    windowed_input = self.t_env.to_changelog_stream(
        reordered,
        Schema.new_builder()
        .column("f1", DataTypes.STRING())
        .column_by_metadata("rowtime", DataTypes.TIMESTAMP_LTZ(3))
        .column_by_expression("ignored", str(col("f1").upper_case))
        .column("f0", DataTypes.INT())
        .build())

    # test event time window and field access
    keyed = windowed_input.key_by(lambda k: k.f1)
    summed = keyed.window(MyTumblingEventTimeWindow()).apply(
        SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()]))
    summed.add_sink(self.test_sink)
    self.env.execute()

    # Sort both sides so the comparison is independent of emission order.
    expected_results = sorted(['(A,47)', '(C,1000)', '(C,1000)'])
    actual_results = self.test_sink.get_results(False)
    actual_results.sort()
    self.assertEqual(expected_results, actual_results)
def test_from_data_stream_with_schema(self):
    """
    Converts a named-row DataStream into a table using an explicitly declared schema.

    The table is executed and the collected rows are compared, order-independently, with
    the rows that were put into the stream.
    """
    from pyflink.table import Schema

    row_type = Types.ROW_NAMED(
        ["a", "b", "c"],
        [Types.INT(), Types.STRING(), Types.STRING()])
    ds = self.env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=row_type)

    table = self.t_env.from_data_stream(
        ds,
        Schema.new_builder()
        .column("a", DataTypes.INT())
        .column("b", DataTypes.STRING())
        .column("c", DataTypes.STRING())
        .build())

    # The collected iterator is used as a context manager so it is closed afterwards.
    with table.execute().collect() as rows:
        collected_result = sorted(str(row) for row in rows)
        expected_result = sorted(
            str(row) for row in [Row(1, 'Hi', 'Hello'), Row(2, 'Hello', 'Hi')])
        self.assertEqual(expected_result, collected_result)
def test_to_string(self):
    """
    Checks the string representations of ``FormatDescriptor`` and ``TableDescriptor``.

    A descriptor with schema, partition key, connector option, format and comment is built
    and ``str()`` of both the format descriptor and the table descriptor is compared with
    the expected text.
    """
    schema = Schema.new_builder().column("f0", DataTypes.STRING()).build()
    # self.option_a is defined outside this method; it is rendered under key "a" below.
    format_descriptor = FormatDescriptor \
        .for_format("test-format") \
        .option(self.option_a, False) \
        .build()
    table_descriptor = TableDescriptor.for_connector("test-connector") \
        .schema(schema) \
        .partitioned_by("f0") \
        .option(self.option_a, True) \
        .format(format_descriptor) \
        .comment("Test Comment") \
        .build()
    self.assertEqual("test-format[{a=false}]", str(format_descriptor))
    # NOTE(review): the expected literal below sits on a single line in the reviewed text.
    # If the descriptor is rendered over several lines, the original line breaks and
    # indentation must be restored here -- confirm against the actual str() output.
    self.assertEqual(
        """( `f0` STRING ) COMMENT 'Test Comment' PARTITIONED BY (`f0`) WITH ( 'a' = 'true', 'connector' = 'test-connector', 'test-format.a' = 'false', 'format' = 'test-format' )""",
        str(table_descriptor))
def test_create_temporary_table_from_descriptor(self):
    """
    Registers a temporary table from a descriptor and inspects it via the catalog manager.

    The table must not show up in the current catalog itself, while looking it up through
    the Java catalog manager must return the declared schema and options.
    """
    from pyflink.table.schema import Schema

    t_env = self.t_env
    catalog_name = t_env.get_current_catalog()
    database_name = t_env.get_current_database()

    declared_schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
    descriptor = TableDescriptor.for_connector("fake") \
        .schema(declared_schema) \
        .option("a", "Test") \
        .build()
    t_env.create_temporary_table("T", descriptor)

    # A temporary table is not stored in the catalog.
    table_path = ObjectPath(database_name, "T")
    self.assertFalse(t_env.get_catalog(catalog_name).table_exists(table_path))

    # Look the table up through the catalog manager instead.
    gateway = get_gateway()
    j_catalog_manager = t_env._j_tenv.getCatalogManager()
    j_identifier = gateway.jvm.ObjectIdentifier.of(catalog_name, database_name, "T")
    catalog_table = CatalogBaseTable(
        j_catalog_manager.getTable(j_identifier).get().getTable())

    self.assertEqual(declared_schema, catalog_table.get_unresolved_schema())
    for option_key, option_value in (("connector", "fake"), ("a", "Test")):
        self.assertEqual(option_value, catalog_table.get_options().get(option_key))
def test_stream_case(self):
    """
    Runs the streaming example of the PyFlink shell and verifies the written CSV output.

    The statements between the ``example begin`` marker and the verification comment are
    kept in the form in which they are copied into ``shell.py``.
    """
    from pyflink.shell import s_env, st_env, DataTypes
    from pyflink.table.schema import Schema
    from pyflink.table.table_descriptor import TableDescriptor, FormatDescriptor
    # example begin
    import tempfile
    import os
    import shutil
    sink_path = tempfile.gettempdir() + '/streaming.csv'
    # Remove leftovers of a previous run; the sink path may be a file or a directory.
    if os.path.exists(sink_path):
        if os.path.isfile(sink_path):
            os.remove(sink_path)
        else:
            shutil.rmtree(sink_path)
    s_env.set_parallelism(1)
    t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c'])

    st_env.create_temporary_table("stream_sink", TableDescriptor.for_connector("filesystem")
                                  .schema(Schema.new_builder()
                                          .column("a", DataTypes.BIGINT())
                                          .column("b", DataTypes.STRING())
                                          .column("c", DataTypes.STRING())
                                          .build())
                                  .option("path", sink_path)
                                  .format(FormatDescriptor.for_format("csv")
                                          .option("field-delimiter", ",")
                                          .build())
                                  .build())

    t.select(t.a + 1, t.b, t.c).execute_insert("stream_sink").wait()

    # verification code; do not copy this part to shell.py
    # The sink path is listed as a directory and its first entry is read back.
    with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
        lines = f.read()
        self.assertEqual(lines, '2,hi,hello\n' + '3,hi,hello\n')
def get_unresolved_schema(self) -> Schema:
    """
    Gets the unresolved schema of this table or view.

    The schema may reference objects from other catalogs; the framework resolves and
    validates it when the table or view is accessed.

    :return: The underlying Java unresolved schema wrapped in a :class:`Schema`.
    """
    j_unresolved_schema = self._j_catalog_base_table.getUnresolvedSchema()
    return Schema(j_unresolved_schema)
def test_table_from_descriptor(self):
    """
    Creates a table from a ``TableDescriptor`` and checks schema and connector option.

    The resolved schema of the created table is converted back into an unresolved
    ``Schema`` and compared with the declared one; the ``connector`` option is read from
    the catalog table that the catalog manager returns for the table's identifier.
    """
    from pyflink.table.schema import Schema

    declared_schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
    fake_table = self.t_env.from_descriptor(
        TableDescriptor.for_connector("fake").schema(declared_schema).build())

    # Rebuild an unresolved schema from the resolved one through the Java builder.
    j_builder = Schema.new_builder()._j_builder
    j_rebuilt_schema = j_builder.fromResolvedSchema(
        fake_table._j_table.getResolvedSchema()).build()
    self.assertEqual(declared_schema, Schema(j_rebuilt_schema))

    # Resolve the table through the catalog manager to get at its options.
    j_catalog_manager = self.t_env._j_tenv.getCatalogManager()
    j_identifier = fake_table._j_table.getQueryOperation().getTableIdentifier()
    catalog_table = CatalogBaseTable(
        j_catalog_manager.getTable(j_identifier).get().getTable())
    self.assertEqual("fake", catalog_table.get_options().get("connector"))
def test_format_basic(self):
    """
    Checks that a format given by name ends up as the ``format`` option.

    Besides the ``connector`` option no other option is expected on the descriptor.
    """
    descriptor = (
        TableDescriptor.for_connector("test-connector")
        .schema(Schema.new_builder().build())
        .format("json")
        .build()
    )

    expected_options = {
        "connector": "test-connector",
        "format": "json",
    }
    self.assertEqual(len(expected_options), len(descriptor.get_options()))
    for key, value in expected_options.items():
        self.assertEqual(value, descriptor.get_options().get(key))
def test_execute_insert_to_table_descriptor(self):
    """
    Inserts a ``datagen`` table into a ``blackhole`` sink that is given as a descriptor.

    Source and sink are both declared inline through ``TableDescriptor``, so the test
    exercises ``Table.execute_insert`` with a descriptor instead of a table path.
    """
    schema = Schema.new_builder() \
        .column("f0", DataTypes.STRING()) \
        .build()
    table = self.t_env.from_descriptor(
        TableDescriptor.for_connector("datagen")
        .option("number-of-rows", '10')
        .schema(schema)
        .build())
    table_result = table.execute_insert(
        TableDescriptor.for_connector("blackhole")
        .schema(schema)
        .build())
    # Drain the result inside a ``with`` block, as the other collecting tests do, so the
    # iterator returned by ``collect()`` is consumed and closed rather than dropped.
    with table_result.collect() as results:
        for _ in results:
            pass
def test_options(self):
    """
    Checks that options set on the builder are exposed as strings by the descriptor.

    Two options are set through option objects held on the test instance and one through
    a plain string key; together with ``connector`` four options are expected.
    """
    descriptor = (
        TableDescriptor.for_connector("test-connector")
        .schema(Schema.new_builder().build())
        .option(self.option_a, False)
        .option(self.option_b, 42)
        .option("c", "C")
        .build()
    )

    expected_options = {
        "connector": "test-connector",
        "a": "false",
        "b": "42",
        "c": "C",
    }
    self.assertEqual(len(expected_options), len(descriptor.get_options()))
    for key, value in expected_options.items():
        self.assertEqual(value, descriptor.get_options().get(key))
def test_schema_basic(self):
    """
    Builds a schema with most builder features and checks its string representation.

    A first schema is assembled from a row data type and from field name/type lists. It
    is then extended with a named primary key, physical columns, metadata columns,
    computed columns and a watermark, and ``str()`` of the result is compared with the
    expected text. The final schema is stored on ``self.schema``.
    """
    # Columns a, b, c come from the row data type; d and e from the name/type lists.
    old_schema = Schema.new_builder() \
        .from_row_data_type(DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT())])) \
        .from_fields(["d", "e"], [DataTypes.STRING(), DataTypes.BOOLEAN()]) \
        .build()
    # Data types and expressions are passed both as objects and as plain strings. The
    # primary key is declared before the "id" column it refers to.
    self.schema = Schema.new_builder() \
        .from_schema(old_schema) \
        .primary_key_named("primary_constraint", "id") \
        .column("id", DataTypes.INT().not_null()) \
        .column("counter", DataTypes.INT().not_null()) \
        .column("payload", "ROW<name STRING, age INT, flag BOOLEAN>") \
        .column_by_metadata("topic", DataTypes.STRING(), None, True) \
        .column_by_expression("ts", call_sql("orig_ts - INTERVAL '60' MINUTE")) \
        .column_by_metadata("orig_ts", DataTypes.TIMESTAMP(3), "timestamp") \
        .watermark("ts", "ts - INTERVAL '5' SECOND") \
        .column_by_expression("proctime", "PROCTIME()") \
        .build()
    # NOTE(review): the expected literal below sits on a single line in the reviewed text.
    # If the schema is rendered over several lines, the original line breaks and
    # indentation must be restored here -- confirm against the actual str() output.
    self.assertEqual(
        """( `a` TINYINT, `b` SMALLINT, `c` INT, `d` STRING, `e` BOOLEAN, `id` INT NOT NULL, `counter` INT NOT NULL, `payload` [ROW<name STRING, age INT, flag BOOLEAN>], `topic` METADATA VIRTUAL, `ts` AS [orig_ts - INTERVAL '60' MINUTE], `orig_ts` METADATA FROM 'timestamp', `proctime` AS [PROCTIME()], WATERMARK FOR `ts` AS [ts - INTERVAL '5' SECOND], CONSTRAINT `primary_constraint` PRIMARY KEY (`id`) NOT ENFORCED )""",
        str(self.schema))
def test_create_table_from_descriptor(self):
    """
    Creates a table from a descriptor and reads it back from the current catalog.

    The table must exist in the current catalog and database and must carry the declared
    schema and options.
    """
    from pyflink.table.schema import Schema

    t_env = self.t_env
    catalog_name = t_env.get_current_catalog()
    database_name = t_env.get_current_database()

    declared_schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
    descriptor = TableDescriptor.for_connector("fake") \
        .schema(declared_schema) \
        .option("a", "Test") \
        .build()
    t_env.create_table("T", descriptor)

    table_path = ObjectPath(database_name, "T")
    self.assertTrue(t_env.get_catalog(catalog_name).table_exists(table_path))

    catalog_table = t_env.get_catalog(catalog_name).get_table(table_path)
    self.assertEqual(declared_schema, catalog_table.get_unresolved_schema())
    for option_key, option_value in (("connector", "fake"), ("a", "Test")):
        self.assertEqual(option_value, catalog_table.get_options().get(option_key))
def test_statement_set_insert_using_table_descriptor(self):
    """
    Adds an insert into a descriptor-defined sink to a statement set and executes it.

    A ``datagen`` source is registered as temporary table "T" and inserted into a
    ``blackhole`` sink that is passed to the statement set as a ``TableDescriptor``.
    """
    int_schema = Schema.new_builder().column("f0", DataTypes.INT()).build()

    datagen_source = (
        TableDescriptor.for_connector("datagen")
        .schema(int_schema)
        .option("number-of-rows", '10')
        .build()
    )
    blackhole_sink = (
        TableDescriptor.for_connector("blackhole")
        .schema(int_schema)
        .build()
    )

    self.t_env.create_temporary_table("T", datagen_source)

    statement_set = self.t_env.create_statement_set()
    source_table = self.t_env.from_path("T")
    statement_set.add_insert(blackhole_sink, source_table)

    # Block until the submitted job has finished.
    statement_set.execute().wait()
def test_format_with_format_descriptor(self):
    """
    Checks how a ``FormatDescriptor`` with options is merged into the table options.

    The format is attached together with ``self.key_format``; the expected option keys
    all start with ``key.`` and the format's own options are additionally prefixed with
    the format name.
    """
    descriptor = (
        TableDescriptor.for_connector("test-connector")
        .schema(Schema.new_builder().build())
        .format(
            FormatDescriptor.for_format("test-format")
            .option(self.option_a, True)
            .option(self.option_b, 42)
            .option("c", "C")
            .build(),
            self.key_format)
        .build()
    )

    expected_options = {
        "connector": "test-connector",
        "key.format": "test-format",
        "key.test-format.a": "true",
        "key.test-format.b": "42",
        "key.test-format.c": "C",
    }
    self.assertEqual(len(expected_options), len(descriptor.get_options()))
    for key, value in expected_options.items():
        self.assertEqual(value, descriptor.get_options().get(key))
def test_basic(self):
    """
    Checks the basic getters of a descriptor with schema, partition key and comment.

    The schema must be present, ``f0`` must be the only partition key, ``connector`` the
    only option, and the comment must be returned unchanged.
    """
    keyed_schema = (
        Schema.new_builder()
        .column("f0", DataTypes.STRING())
        .column("f1", DataTypes.BIGINT())
        .primary_key("f0")
        .build()
    )
    descriptor = (
        TableDescriptor.for_connector("test-connector")
        .schema(keyed_schema)
        .partitioned_by("f0")
        .comment("Test Comment")
        .build()
    )

    self.assertIsNotNone(descriptor.get_schema())

    self.assertEqual(1, len(descriptor.get_partition_keys()))
    first_partition_key = descriptor.get_partition_keys()[0]
    self.assertEqual("f0", first_partition_key)

    self.assertEqual(1, len(descriptor.get_options()))
    connector = descriptor.get_options().get("connector")
    self.assertEqual("test-connector", connector)

    self.assertEqual("Test Comment", descriptor.get_comment())
import os
import shutil

# The sink writes below the system temp directory; remove leftovers of a previous run
# first. The path may be a plain file or a directory.
sink_path = tempfile.gettempdir() + '/batch.csv'
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)

s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c'])

# Declare a CSV filesystem sink with three columns at sink_path.
st_env.create_temporary_table("csv_sink", TableDescriptor.for_connector("filesystem")
                              .schema(Schema.new_builder()
                                      .column("a", DataTypes.BIGINT())
                                      .column("b", DataTypes.STRING())
                                      .column("c", DataTypes.STRING())
                                      .build())
                              .option("path", sink_path)
                              .format(FormatDescriptor.for_format("csv")
                                      .option("field-delimiter", ",")
                                      .build())
                              .build())

# Select through column expressions rather than the string expression form
# ("a + 1, b, c"), which not every PyFlink version accepts.
t.select(t.a + 1, t.b, t.c).execute_insert("csv_sink").wait()

# The sink path is listed as a directory and its first entry is read back.
with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')
def get_schema(self) -> Optional[Schema]:
    """
    Gets the schema of this descriptor, if one is present.

    :return: The Java schema wrapped in a :class:`Schema`, or ``None`` when the
             underlying Java optional is empty.
    """
    j_optional_schema = self._j_table_descriptor.getSchema()
    if not j_optional_schema.isPresent():
        return None
    return Schema(j_optional_schema.get())
import os
import shutil

# Start from a clean sink location: the path below the system temp directory may be a
# plain file or a directory left behind by a previous run.
sink_path = tempfile.gettempdir() + '/batch.csv'
if os.path.isfile(sink_path):
    os.remove(sink_path)
elif os.path.exists(sink_path):
    shutil.rmtree(sink_path)

s_env.set_parallelism(1)
t = st_env.from_elements(
    [(1, 'hi', 'hello'), (2, 'hi', 'hello')],
    ['a', 'b', 'c'])

# Declare a CSV filesystem sink with three columns at sink_path.
st_env.create_temporary_table(
    "csv_sink",
    TableDescriptor.for_connector("filesystem")
    .schema(
        Schema.new_builder()
        .column("a", DataTypes.BIGINT())
        .column("b", DataTypes.STRING())
        .column("c", DataTypes.STRING())
        .build())
    .option("path", sink_path)
    .format(
        FormatDescriptor.for_format("csv")
        .option("field-delimiter", ",")
        .build())
    .build())

# Increment the first column, write the rows to the sink and wait for completion.
t.select(t.a + lit(1), t.b, t.c).execute_insert("csv_sink").wait()

# The sink path is listed as a directory and its first entry is read back.
with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')