Example 1
    def test_from_and_to_data_stream_event_time(self):
        from pyflink.table import Schema

        ds = self.env.from_collection([(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
                                      Types.ROW_NAMED(
                                          ["a", "b", "c"],
                                          [Types.LONG(), Types.INT(), Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps()
            .with_timestamp_assigner(MyTimestampAssigner()))

        table = self.t_env.from_data_stream(ds,
                                            Schema.new_builder()
                                                  .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                                  .watermark("rowtime", "SOURCE_WATERMARK()")
                                                  .build())
        self.assertEqual("""(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
                         table._j_table.getResolvedSchema().toString())
        self.t_env.create_temporary_view("t",
                                         ds,
                                         Schema.new_builder()
                                         .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                         .watermark("rowtime", "SOURCE_WATERMARK()")
                                         .build())

        result = self.t_env.execute_sql("SELECT "
                                        "c, SUM(b) "
                                        "FROM t "
                                        "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [item for item in
                               map(str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)

        ds = self.t_env.to_data_stream(table)
        ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
        actual_results = self.test_sink.get_results(False)
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)
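
The test above references helpers (MyTimestampAssigner, MyTumblingEventTimeWindow, SumWindowFunction) that are defined elsewhere in the suite. As a rough, self-contained sketch of the DataStream-to-Table half, here is the same rowtime/SOURCE_WATERMARK() pattern with an illustrative assigner that reads the event timestamp from the first field (a hypothetical stand-in for MyTimestampAssigner, not the suite's actual implementation):

from pyflink.common import Types
from pyflink.common.watermark_strategy import TimestampAssigner, WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import Schema, StreamTableEnvironment


class FirstFieldTimestampAssigner(TimestampAssigner):
    # Treat the first field of each record as its event time in epoch millis.
    def extract_timestamp(self, value, record_timestamp):
        return value[0]


env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(env)

ds = env.from_collection(
    [(1, "a"), (2, "b")],
    Types.ROW_NAMED(["ts_millis", "data"], [Types.LONG(), Types.STRING()]))
ds = ds.assign_timestamps_and_watermarks(
    WatermarkStrategy.for_monotonous_timestamps()
                     .with_timestamp_assigner(FirstFieldTimestampAssigner()))

# Expose the stream's timestamps and watermarks to the Table API, as in the
# test: physical columns are derived, rowtime is added from metadata.
table = t_env.from_data_stream(
    ds,
    Schema.new_builder()
          .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
          .watermark("rowtime", "SOURCE_WATERMARK()")
          .build())
table.print_schema()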
Example 2
    def test_table_from_descriptor(self):
        from pyflink.table.schema import Schema

        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        descriptor = TableDescriptor.for_connector("fake").schema(schema).build()

        table = self.t_env.from_descriptor(descriptor)
        self.assertEqual(schema,
                         Schema(Schema.new_builder()._j_builder
                                .fromResolvedSchema(table._j_table.getResolvedSchema()).build()))
        contextResolvedTable = table._j_table.getQueryOperation().getContextResolvedTable()
        options = contextResolvedTable.getTable().getOptions()
        self.assertEqual("fake", options.get("connector"))
Example 3
    def test_from_and_to_changelog_stream_event_time(self):
        from pyflink.table import Schema

        self.env.set_parallelism(1)
        ds = self.env.from_collection(
            [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
            Types.ROW([Types.LONG(), Types.INT(),
                       Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps(
            ).with_timestamp_assigner(MyTimestampAssigner()))

        changelog_stream = ds.map(lambda t: Row(t.f1, t.f2),
                                  Types.ROW([Types.INT(),
                                             Types.STRING()]))

        # derive physical columns and add a rowtime
        table = self.t_env.from_changelog_stream(
            changelog_stream,
            Schema.new_builder().column_by_metadata(
                "rowtime", DataTypes.TIMESTAMP_LTZ(3)).column_by_expression(
                    "computed", str(col("f1").upper_case)).watermark(
                        "rowtime", str(source_watermark())).build())

        self.t_env.create_temporary_view("t", table)

        # access and reorder columns
        reordered = self.t_env.sql_query("SELECT computed, rowtime, f0 FROM t")

        # write out the rowtime column with fully declared schema
        result = self.t_env.to_changelog_stream(
            reordered,
            Schema.new_builder().column(
                "f1", DataTypes.STRING()).column_by_metadata(
                    "rowtime",
                    DataTypes.TIMESTAMP_LTZ(3)).column_by_expression(
                        "ignored", str(col("f1").upper_case)).column(
                            "f0", DataTypes.INT()).build())

        # test event time window and field access
        result.key_by(lambda k: k.f1) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(A,47)', '(C,1000)', '(C,1000)']
        actual_results = self.test_sink.get_results(False)
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)
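
For reference, the simplest form of from_changelog_stream needs no schema at all; a minimal, self-contained sketch of an insert-only changelog (default RowKind), following the pattern above:

from pyflink.common import Row, Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(env)

# Each Row carries a RowKind (INSERT by default) that from_changelog_stream
# interprets as a change to the resulting table.
ds = env.from_collection(
    [Row(1, "a"), Row(2, "b")],
    type_info=Types.ROW([Types.INT(), Types.STRING()]))
table = t_env.from_changelog_stream(ds)
table.execute().print()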
Example 4
    def test_from_data_stream_with_schema(self):
        from pyflink.table import Schema

        ds = self.env.from_collection(
            [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
            type_info=Types.ROW_NAMED(
                ["a", "b", "c"],
                [Types.INT(), Types.STRING(),
                 Types.STRING()]))

        table = self.t_env.from_data_stream(
            ds,
            Schema.new_builder().column("a", DataTypes.INT()).column(
                "b", DataTypes.STRING()).column("c",
                                                DataTypes.STRING()).build())
        result = table.execute()
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [
                item for item in map(
                    str, [Row(1, 'Hi', 'Hello'),
                          Row(2, 'Hello', 'Hi')])
            ]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)
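
Outside the test harness, the same pattern is only a few lines; a minimal, self-contained sketch (environment setup added, everything else as in the test):

from pyflink.common import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import DataTypes, Schema, StreamTableEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(env)

ds = env.from_collection(
    [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
    type_info=Types.ROW_NAMED(
        ["a", "b", "c"],
        [Types.INT(), Types.STRING(), Types.STRING()]))

# The declared schema must be compatible with the stream's row type.
table = t_env.from_data_stream(
    ds,
    Schema.new_builder()
          .column("a", DataTypes.INT())
          .column("b", DataTypes.STRING())
          .column("c", DataTypes.STRING())
          .build())
table.execute().print()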
Example 5
    def test_to_string(self):
        schema = Schema.new_builder().column("f0", DataTypes.STRING()).build()
        format_descriptor = FormatDescriptor \
            .for_format("test-format") \
            .option(self.option_a, False) \
            .build()
        table_descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(schema) \
            .partitioned_by("f0") \
            .option(self.option_a, True) \
            .format(format_descriptor) \
            .comment("Test Comment") \
            .build()
        self.assertEqual("test-format[{a=false}]", str(format_descriptor))
        self.assertEqual(
            """(
  `f0` STRING
)
COMMENT 'Test Comment'
PARTITIONED BY (`f0`)
WITH (
  'a' = 'true',
  'connector' = 'test-connector',
  'test-format.a' = 'false',
  'format' = 'test-format'
)""", str(table_descriptor))
Example 6
    def test_create_temporary_table_from_descriptor(self):
        from pyflink.table.schema import Schema

        t_env = self.t_env
        catalog = t_env.get_current_catalog()
        database = t_env.get_current_database()
        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        t_env.create_temporary_table(
            "T",
            TableDescriptor.for_connector("fake")
             .schema(schema)
             .option("a", "Test")
             .build())

        self.assertFalse(t_env.get_catalog(catalog).table_exists(ObjectPath(database, "T")))
        gateway = get_gateway()

        catalog_table = CatalogBaseTable(
            t_env._j_tenv.getCatalogManager()
                 .getTable(gateway.jvm.ObjectIdentifier.of(catalog, database, "T"))
                 .get()
                 .getTable())
        self.assertEqual(schema, catalog_table.get_unresolved_schema())
        self.assertEqual("fake", catalog_table.get_options().get("connector"))
        self.assertEqual("Test", catalog_table.get_options().get("a"))
Example 7
    def test_stream_case(self):
        from pyflink.shell import s_env, st_env, DataTypes
        from pyflink.table.schema import Schema
        from pyflink.table.table_descriptor import TableDescriptor, FormatDescriptor
        # example begin

        import tempfile
        import os
        import shutil
        sink_path = tempfile.gettempdir() + '/streaming.csv'
        if os.path.exists(sink_path):
            if os.path.isfile(sink_path):
                os.remove(sink_path)
            else:
                shutil.rmtree(sink_path)
        s_env.set_parallelism(1)
        t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')],
                                 ['a', 'b', 'c'])

        st_env.create_temporary_table(
            "stream_sink",
            TableDescriptor.for_connector("filesystem").schema(
                Schema.new_builder().column("a", DataTypes.BIGINT()).column(
                    "b", DataTypes.STRING()).column(
                        "c", DataTypes.STRING()).build()).option(
                            "path", sink_path).format(
                                FormatDescriptor.for_format("csv").option(
                                    "field-delimiter", ",").build()).build())

        t.select(t.a + 1, t.b, t.c).execute_insert("stream_sink").wait()

        # verification code; do not copy this block into shell.py
        with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
            lines = f.read()
            self.assertEqual(lines, '2,hi,hello\n' + '3,hi,hello\n')
Example 8
    def test_table_from_descriptor(self):
        from pyflink.table.schema import Schema

        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        descriptor = TableDescriptor.for_connector("fake").schema(
            schema).build()

        table = self.t_env.from_descriptor(descriptor)
        self.assertEqual(
            schema,
            Schema(Schema.new_builder()._j_builder.fromResolvedSchema(
                table._j_table.getResolvedSchema()).build()))
        table = CatalogBaseTable(
            self.t_env._j_tenv.getCatalogManager().getTable(
                table._j_table.getQueryOperation().getTableIdentifier()).get(
                ).getTable())
        self.assertEqual("fake", table.get_options().get("connector"))
Example 9
    def test_format_basic(self):
        descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(Schema.new_builder().build()) \
            .format("json") \
            .build()
        self.assertEqual(2, len(descriptor.get_options()))
        self.assertEqual("test-connector",
                         descriptor.get_options().get("connector"))
        self.assertEqual("json", descriptor.get_options().get("format"))
Example 10
    def test_execute_insert_to_table_descriptor(self):
        schema = Schema.new_builder() \
            .column("f0", DataTypes.STRING()) \
            .build()
        table = self.t_env.from_descriptor(
            TableDescriptor.for_connector("datagen").option(
                "number-of-rows", '10').schema(schema).build())
        table_result = table.execute_insert(
            TableDescriptor.for_connector("blackhole").schema(schema).build())
        table_result.collect()
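
The same source/sink-by-descriptor round trip can be run standalone; a sketch using the built-in datagen and print connectors (print writes rows to stdout instead of discarding them like blackhole):

from pyflink.table import DataTypes, EnvironmentSettings, Schema, TableEnvironment
from pyflink.table.table_descriptor import TableDescriptor

t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

schema = Schema.new_builder().column("f0", DataTypes.STRING()).build()
source = t_env.from_descriptor(
    TableDescriptor.for_connector("datagen")
                   .schema(schema)
                   .option("number-of-rows", "5")
                   .build())
source.execute_insert(
    TableDescriptor.for_connector("print").schema(schema).build()).wait()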
Example 11
    def test_options(self):
        descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(Schema.new_builder().build()) \
            .option(self.option_a, False) \
            .option(self.option_b, 42) \
            .option("c", "C") \
            .build()
        self.assertEqual(4, len(descriptor.get_options()))
        self.assertEqual("test-connector",
                         descriptor.get_options().get("connector"))
        self.assertEqual("false", descriptor.get_options().get("a"))
        self.assertEqual("42", descriptor.get_options().get("b"))
        self.assertEqual("C", descriptor.get_options().get("c"))
Example 12
    def test_schema_basic(self):
        old_schema = Schema.new_builder() \
            .from_row_data_type(DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())])) \
            .from_fields(["d", "e"], [DataTypes.STRING(), DataTypes.BOOLEAN()]) \
            .build()
        self.schema = Schema.new_builder() \
            .from_schema(old_schema) \
            .primary_key_named("primary_constraint", "id") \
            .column("id", DataTypes.INT().not_null()) \
            .column("counter", DataTypes.INT().not_null()) \
            .column("payload", "ROW<name STRING, age INT, flag BOOLEAN>") \
            .column_by_metadata("topic", DataTypes.STRING(), None, True) \
            .column_by_expression("ts", call_sql("orig_ts - INTERVAL '60' MINUTE")) \
            .column_by_metadata("orig_ts", DataTypes.TIMESTAMP(3), "timestamp") \
            .watermark("ts", "ts - INTERVAL '5' SECOND") \
            .column_by_expression("proctime", "PROCTIME()") \
            .build()
        self.assertEqual(
            """(
  `a` TINYINT,
  `b` SMALLINT,
  `c` INT,
  `d` STRING,
  `e` BOOLEAN,
  `id` INT NOT NULL,
  `counter` INT NOT NULL,
  `payload` [ROW<name STRING, age INT, flag BOOLEAN>],
  `topic` METADATA VIRTUAL,
  `ts` AS [orig_ts - INTERVAL '60' MINUTE],
  `orig_ts` METADATA FROM 'timestamp',
  `proctime` AS [PROCTIME()],
  WATERMARK FOR `ts` AS [ts - INTERVAL '5' SECOND],
  CONSTRAINT `primary_constraint` PRIMARY KEY (`id`) NOT ENFORCED
)""", str(self.schema))
Example 13
    def test_create_table_from_descriptor(self):
        from pyflink.table.schema import Schema

        catalog = self.t_env.get_current_catalog()
        database = self.t_env.get_current_database()
        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        self.t_env.create_table(
            "T",
            TableDescriptor.for_connector("fake").schema(schema).option(
                "a", "Test").build())
        object_path = ObjectPath(database, "T")
        self.assertTrue(
            self.t_env.get_catalog(catalog).table_exists(object_path))

        catalog_table = self.t_env.get_catalog(catalog).get_table(object_path)
        self.assertEqual(schema, catalog_table.get_unresolved_schema())
        self.assertEqual("fake", catalog_table.get_options().get("connector"))
        self.assertEqual("Test", catalog_table.get_options().get("a"))
Example 14
    def test_statement_set_insert_using_table_descriptor(self):
        schema = Schema.new_builder() \
            .column("f0", DataTypes.INT()) \
            .build()

        source_descriptor = TableDescriptor.for_connector("datagen") \
            .schema(schema) \
            .option("number-of-rows", '10') \
            .build()

        sink_descriptor = TableDescriptor.for_connector("blackhole") \
            .schema(schema) \
            .build()

        self.t_env.create_temporary_table("T", source_descriptor)

        stmt_set = self.t_env.create_statement_set()
        stmt_set.add_insert(sink_descriptor, self.t_env.from_path("T"))

        stmt_set.execute().wait()
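
A self-contained version of the same statement-set flow, assuming only a PyFlink installation (datagen and blackhole are built-in connectors):

from pyflink.table import DataTypes, EnvironmentSettings, Schema, TableEnvironment
from pyflink.table.table_descriptor import TableDescriptor

t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
schema = Schema.new_builder().column("f0", DataTypes.INT()).build()

t_env.create_temporary_table(
    "src",
    TableDescriptor.for_connector("datagen")
                   .schema(schema)
                   .option("number-of-rows", "10")
                   .build())

# Bundle one or more inserts and submit them as a single job.
stmt_set = t_env.create_statement_set()
stmt_set.add_insert(
    TableDescriptor.for_connector("blackhole").schema(schema).build(),
    t_env.from_path("src"))
stmt_set.execute().wait()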
Example 15
    def test_format_with_format_descriptor(self):
        descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(Schema.new_builder().build()) \
            .format(FormatDescriptor.for_format("test-format")
                    .option(self.option_a, True)
                    .option(self.option_b, 42)
                    .option("c", "C")
                    .build(),
                    self.key_format) \
            .build()
        self.assertEqual(5, len(descriptor.get_options()))
        self.assertEqual("test-connector",
                         descriptor.get_options().get("connector"))
        self.assertEqual("test-format",
                         descriptor.get_options().get("key.format"))
        self.assertEqual("true",
                         descriptor.get_options().get("key.test-format.a"))
        self.assertEqual("42",
                         descriptor.get_options().get("key.test-format.b"))
        self.assertEqual("C",
                         descriptor.get_options().get("key.test-format.c"))
Example 16
    def test_basic(self):
        schema = Schema.new_builder() \
            .column("f0", DataTypes.STRING()) \
            .column("f1", DataTypes.BIGINT()) \
            .primary_key("f0") \
            .build()

        descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(schema) \
            .partitioned_by("f0") \
            .comment("Test Comment") \
            .build()

        self.assertIsNotNone(descriptor.get_schema())

        self.assertEqual(1, len(descriptor.get_partition_keys()))
        self.assertEqual("f0", descriptor.get_partition_keys()[0])

        self.assertEqual(1, len(descriptor.get_options()))
        self.assertEqual("test-connector",
                         descriptor.get_options().get("connector"))

        self.assertEqual("Test Comment", descriptor.get_comment())
Example 17
# Snippet from pip_test_code.py; assumes s_env, st_env and the Table API names
# (Schema, DataTypes, TableDescriptor, FormatDescriptor, lit) are already in
# scope, e.g. via pyflink.shell and pyflink.table.expressions.
import os
import shutil
import tempfile

sink_path = tempfile.gettempdir() + '/batch.csv'
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)
s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')],
                         ['a', 'b', 'c'])

st_env.create_temporary_table(
    "csv_sink",
    TableDescriptor.for_connector("filesystem").schema(
        Schema.new_builder().column(
            "a", DataTypes.BIGINT()).column("b", DataTypes.STRING()).column(
                "c",
                DataTypes.STRING()).build()).option("path", sink_path).format(
                    FormatDescriptor.for_format("csv").option(
                        "field-delimiter", ",").build()).build())

t.select(t.a + lit(1), t.b, t.c).execute_insert("csv_sink").wait()

with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')
Example 18
# Same pip_test_code.py snippet in its older form, using the legacy
# string-expression select(); assumes the same names are in scope.
import os
import shutil
import tempfile

sink_path = tempfile.gettempdir() + '/batch.csv'
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)
s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c'])

st_env.create_temporary_table("csv_sink", TableDescriptor.for_connector("filesystem")
    .schema(Schema.new_builder()
        .column("a", DataTypes.BIGINT())
        .column("b", DataTypes.STRING())
        .column("c", DataTypes.STRING())
        .build())
    .option("path", sink_path)
    .format(FormatDescriptor.for_format("csv")
        .option("field-delimiter", ",")
        .build())
    .build())

t.select("a + 1, b, c").execute_insert("csv_sink").wait()

with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')