Example #1
    def test_from_and_to_data_stream_event_time(self):
        from pyflink.table import Schema

        ds = self.env.from_collection([(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
                                      Types.ROW_NAMED(
                                          ["a", "b", "c"],
                                          [Types.LONG(), Types.INT(), Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps()
            .with_timestamp_assigner(MyTimestampAssigner()))

        table = self.t_env.from_data_stream(ds,
                                            Schema.new_builder()
                                                  .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                                  .watermark("rowtime", "SOURCE_WATERMARK()")
                                                  .build())
        self.assertEqual("""(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
                         table._j_table.getResolvedSchema().toString())
        self.t_env.create_temporary_view("t",
                                         ds,
                                         Schema.new_builder()
                                         .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                         .watermark("rowtime", "SOURCE_WATERMARK()")
                                         .build())

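        # MyTimestampAssigner presumably uses the first field as the event timestamp,
        # so the rows at 1 ms and 2 ms share one 5 ms tumbling window (SUM(b) = 47 for 'a')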
        result = self.t_env.execute_sql("SELECT "
                                        "c, SUM(b) "
                                        "FROM t "
                                        "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [item for item in
                               map(str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)

        ds = self.t_env.to_data_stream(table)
        ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
        actual_results = self.test_sink.get_results(False)
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)
Example #2
    def test_table_from_descriptor(self):
        from pyflink.table.schema import Schema

        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        descriptor = TableDescriptor.for_connector("fake").schema(schema).build()

        table = self.t_env.from_descriptor(descriptor)
        self.assertEqual(schema,
                         Schema(Schema.new_builder()._j_builder
                                .fromResolvedSchema(table._j_table.getResolvedSchema()).build()))
        contextResolvedTable = table._j_table.getQueryOperation().getContextResolvedTable()
        options = contextResolvedTable.getTable().getOptions()
        self.assertEqual("fake", options.get("connector"))
Example #3
    def test_from_and_to_changelog_stream_event_time(self):
        from pyflink.table import Schema

        self.env.set_parallelism(1)
        ds = self.env.from_collection(
            [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
            Types.ROW([Types.LONG(), Types.INT(),
                       Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps(
            ).with_timestamp_assigner(MyTimestampAssigner()))

        changelog_stream = ds.map(lambda t: Row(t.f1, t.f2),
                                  Types.ROW([Types.INT(),
                                             Types.STRING()]))

        # derive physical columns and add a rowtime
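        # (expression and watermark specs are passed as SQL strings,
        # hence str(col(...)) and str(source_watermark()))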
        table = self.t_env.from_changelog_stream(
            changelog_stream,
            Schema.new_builder()
                  .column_by_metadata("rowtime", DataTypes.TIMESTAMP_LTZ(3))
                  .column_by_expression("computed", str(col("f1").upper_case))
                  .watermark("rowtime", str(source_watermark()))
                  .build())

        self.t_env.create_temporary_view("t", table)

        # access and reorder columns
        reordered = self.t_env.sql_query("SELECT computed, rowtime, f0 FROM t")

        # write out the rowtime column with fully declared schema
        result = self.t_env.to_changelog_stream(
            reordered,
            Schema.new_builder()
                  .column("f1", DataTypes.STRING())
                  .column_by_metadata("rowtime", DataTypes.TIMESTAMP_LTZ(3))
                  .column_by_expression("ignored", str(col("f1").upper_case))
                  .column("f0", DataTypes.INT())
                  .build())

        # test event time window and field access
        result.key_by(lambda k: k.f1) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(A,47)', '(C,1000)', '(C,1000)']
        actual_results = self.test_sink.get_results(False)
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)
Example #4
    def test_from_data_stream_with_schema(self):
        from pyflink.table import Schema

        ds = self.env.from_collection(
            [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
            type_info=Types.ROW_NAMED(
                ["a", "b", "c"],
                [Types.INT(), Types.STRING(),
                 Types.STRING()]))

        table = self.t_env.from_data_stream(
            ds,
            Schema.new_builder()
                  .column("a", DataTypes.INT())
                  .column("b", DataTypes.STRING())
                  .column("c", DataTypes.STRING())
                  .build())
        result = table.execute()
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [
                item for item in map(
                    str, [Row(1, 'Hi', 'Hello'),
                          Row(2, 'Hello', 'Hi')])
            ]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)
Example #5
    def test_to_string(self):
        schema = Schema.new_builder().column("f0", DataTypes.STRING()).build()
        format_descriptor = FormatDescriptor \
            .for_format("test-format") \
            .option(self.option_a, False) \
            .build()
        table_descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(schema) \
            .partitioned_by("f0") \
            .option(self.option_a, True) \
            .format(format_descriptor) \
            .comment("Test Comment") \
            .build()
        self.assertEqual("test-format[{a=false}]", str(format_descriptor))
        self.assertEqual(
            """(
  `f0` STRING
)
COMMENT 'Test Comment'
PARTITIONED BY (`f0`)
WITH (
  'a' = 'true',
  'connector' = 'test-connector',
  'test-format.a' = 'false',
  'format' = 'test-format'
)""", str(table_descriptor))
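Note how the options are serialized in the WITH clause above: the typed boolean option becomes the strings 'true' and 'false', and the format's own option a is namespaced under the format name as 'test-format.a'.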
Example #6
    def test_create_temporary_table_from_descriptor(self):
        from pyflink.table.schema import Schema

        t_env = self.t_env
        catalog = t_env.get_current_catalog()
        database = t_env.get_current_database()
        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        t_env.create_temporary_table(
            "T",
            TableDescriptor.for_connector("fake")
             .schema(schema)
             .option("a", "Test")
             .build())

        self.assertFalse(t_env.get_catalog(catalog).table_exists(ObjectPath(database, "T")))
        gateway = get_gateway()

        catalog_table = CatalogBaseTable(
            t_env._j_tenv.getCatalogManager()
                 .getTable(gateway.jvm.ObjectIdentifier.of(catalog, database, "T"))
                 .get()
                 .getTable())
        self.assertEqual(schema, catalog_table.get_unresolved_schema())
        self.assertEqual("fake", catalog_table.get_options().get("connector"))
        self.assertEqual("Test", catalog_table.get_options().get("a"))
Example #7
    def test_stream_case(self):
        from pyflink.shell import s_env, st_env, DataTypes
        from pyflink.table.schema import Schema
        from pyflink.table.table_descriptor import TableDescriptor, FormatDescriptor
        # example begin

        import tempfile
        import os
        import shutil
        sink_path = tempfile.gettempdir() + '/streaming.csv'
        if os.path.exists(sink_path):
            if os.path.isfile(sink_path):
                os.remove(sink_path)
            else:
                shutil.rmtree(sink_path)
        s_env.set_parallelism(1)
        t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')],
                                 ['a', 'b', 'c'])

        st_env.create_temporary_table(
            "stream_sink",
            TableDescriptor.for_connector("filesystem")
                .schema(Schema.new_builder()
                        .column("a", DataTypes.BIGINT())
                        .column("b", DataTypes.STRING())
                        .column("c", DataTypes.STRING())
                        .build())
                .option("path", sink_path)
                .format(FormatDescriptor.for_format("csv")
                        .option("field-delimiter", ",")
                        .build())
                .build())

        t.select(t.a + 1, t.b, t.c).execute_insert("stream_sink").wait()

        # verify code, do not copy these code to shell.py
        with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
            lines = f.read()
            self.assertEqual(lines, '2,hi,hello\n' + '3,hi,hello\n')
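Note that the filesystem sink writes part files into a directory at sink_path, which is why the verification step opens os.listdir(sink_path)[0] rather than sink_path itself.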
Example #8
    def get_unresolved_schema(self) -> Schema:
        """
        Returns the schema of the table or view.

        The schema can reference objects from other catalogs and will be resolved and validated by
        the framework when accessing the table or view.
        """
        return Schema(self._j_catalog_base_table.getUnresolvedSchema())
Example #9
    def test_table_from_descriptor(self):
        from pyflink.table.schema import Schema

        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        descriptor = TableDescriptor.for_connector("fake").schema(schema).build()

        table = self.t_env.from_descriptor(descriptor)
        self.assertEqual(
            schema,
            Schema(Schema.new_builder()._j_builder.fromResolvedSchema(
                table._j_table.getResolvedSchema()).build()))
        table = CatalogBaseTable(
            self.t_env._j_tenv.getCatalogManager()
                .getTable(table._j_table.getQueryOperation().getTableIdentifier())
                .get()
                .getTable())
        self.assertEqual("fake", table.get_options().get("connector"))
Example #10
    def test_format_basic(self):
        descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(Schema.new_builder().build()) \
            .format("json") \
            .build()
        self.assertEqual(2, len(descriptor.get_options()))
        self.assertEqual("test-connector",
                         descriptor.get_options().get("connector"))
        self.assertEqual("json", descriptor.get_options().get("format"))
Example #11
    def test_execute_insert_to_table_descriptor(self):
        schema = Schema.new_builder() \
            .column("f0", DataTypes.STRING()) \
            .build()
        table = self.t_env.from_descriptor(
            TableDescriptor.for_connector("datagen")
                .option("number-of-rows", '10')
                .schema(schema)
                .build())
        table_result = table.execute_insert(
            TableDescriptor.for_connector("blackhole").schema(schema).build())
        table_result.collect()
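With number-of-rows set, the datagen source is bounded, so the insert into the blackhole sink terminates on its own.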
Example #12
    def test_options(self):
        descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(Schema.new_builder().build()) \
            .option(self.option_a, False) \
            .option(self.option_b, 42) \
            .option("c", "C") \
            .build()
        self.assertEqual(4, len(descriptor.get_options()))
        self.assertEqual("test-connector",
                         descriptor.get_options().get("connector"))
        self.assertEqual("false", descriptor.get_options().get("a"))
        self.assertEqual("42", descriptor.get_options().get("b"))
        self.assertEqual("C", descriptor.get_options().get("c"))
Example #13
    def test_schema_basic(self):
        old_schema = Schema.new_builder() \
            .from_row_data_type(DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())])) \
            .from_fields(["d", "e"], [DataTypes.STRING(), DataTypes.BOOLEAN()]) \
            .build()
        self.schema = Schema.new_builder() \
            .from_schema(old_schema) \
            .primary_key_named("primary_constraint", "id") \
            .column("id", DataTypes.INT().not_null()) \
            .column("counter", DataTypes.INT().not_null()) \
            .column("payload", "ROW<name STRING, age INT, flag BOOLEAN>") \
            .column_by_metadata("topic", DataTypes.STRING(), None, True) \
            .column_by_expression("ts", call_sql("orig_ts - INTERVAL '60' MINUTE")) \
            .column_by_metadata("orig_ts", DataTypes.TIMESTAMP(3), "timestamp") \
            .watermark("ts", "ts - INTERVAL '5' SECOND") \
            .column_by_expression("proctime", "PROCTIME()") \
            .build()
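        # unresolved data types and expressions are rendered in [brackets]
        # in the schema's string form, as the expected value below shows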
        self.assertEqual(
            """(
  `a` TINYINT,
  `b` SMALLINT,
  `c` INT,
  `d` STRING,
  `e` BOOLEAN,
  `id` INT NOT NULL,
  `counter` INT NOT NULL,
  `payload` [ROW<name STRING, age INT, flag BOOLEAN>],
  `topic` METADATA VIRTUAL,
  `ts` AS [orig_ts - INTERVAL '60' MINUTE],
  `orig_ts` METADATA FROM 'timestamp',
  `proctime` AS [PROCTIME()],
  WATERMARK FOR `ts` AS [ts - INTERVAL '5' SECOND],
  CONSTRAINT `primary_constraint` PRIMARY KEY (`id`) NOT ENFORCED
)""", str(self.schema))
Example #14
    def test_create_table_from_descriptor(self):
        from pyflink.table.schema import Schema

        catalog = self.t_env.get_current_catalog()
        database = self.t_env.get_current_database()
        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        self.t_env.create_table(
            "T",
            TableDescriptor.for_connector("fake")
                .schema(schema)
                .option("a", "Test")
                .build())
        object_path = ObjectPath(database, "T")
        self.assertTrue(
            self.t_env.get_catalog(catalog).table_exists(object_path))

        catalog_table = self.t_env.get_catalog(catalog).get_table(object_path)
        self.assertEqual(schema, catalog_table.get_unresolved_schema())
        self.assertEqual("fake", catalog_table.get_options().get("connector"))
        self.assertEqual("Test", catalog_table.get_options().get("a"))
Example #15
    def test_statement_set_insert_using_table_descriptor(self):
        schema = Schema.new_builder() \
            .column("f0", DataTypes.INT()) \
            .build()

        source_descriptor = TableDescriptor.for_connector("datagen") \
            .schema(schema) \
            .option("number-of-rows", '10') \
            .build()

        sink_descriptor = TableDescriptor.for_connector("blackhole") \
            .schema(schema) \
            .build()

        self.t_env.create_temporary_table("T", source_descriptor)

        stmt_set = self.t_env.create_statement_set()
        stmt_set.add_insert(sink_descriptor, self.t_env.from_path("T"))

        stmt_set.execute().wait()
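A statement set bundles multiple INSERTs so that execute() submits them as one job; wait() blocks until that job finishes.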
Example #16
    def test_format_with_format_descriptor(self):
        descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(Schema.new_builder().build()) \
            .format(FormatDescriptor.for_format("test-format")
                    .option(self.option_a, True)
                    .option(self.option_b, 42)
                    .option("c", "C")
                    .build(),
                    self.key_format) \
            .build()
        self.assertEqual(5, len(descriptor.get_options()))
        self.assertEqual("test-connector",
                         descriptor.get_options().get("connector"))
        self.assertEqual("test-format",
                         descriptor.get_options().get("key.format"))
        self.assertEqual("true",
                         descriptor.get_options().get("key.test-format.a"))
        self.assertEqual("42",
                         descriptor.get_options().get("key.test-format.b"))
        self.assertEqual("C",
                         descriptor.get_options().get("key.test-format.c"))
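Passing self.key_format as the second argument to format() routes the format and its options under the key. prefix, as the assertions above show.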
Example #17
    def test_basic(self):
        schema = Schema.new_builder() \
            .column("f0", DataTypes.STRING()) \
            .column("f1", DataTypes.BIGINT()) \
            .primary_key("f0") \
            .build()

        descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(schema) \
            .partitioned_by("f0") \
            .comment("Test Comment") \
            .build()

        self.assertIsNotNone(descriptor.get_schema())

        self.assertEqual(1, len(descriptor.get_partition_keys()))
        self.assertEqual("f0", descriptor.get_partition_keys()[0])

        self.assertEqual(1, len(descriptor.get_options()))
        self.assertEqual("test-connector",
                         descriptor.get_options().get("connector"))

        self.assertEqual("Test Comment", descriptor.get_comment())
Example #18
import tempfile
import os
import shutil

from pyflink.shell import s_env, st_env, DataTypes
from pyflink.table.schema import Schema
from pyflink.table.table_descriptor import TableDescriptor, FormatDescriptor

sink_path = tempfile.gettempdir() + '/batch.csv'
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)
s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c'])

st_env.create_temporary_table("csv_sink", TableDescriptor.for_connector("filesystem")
    .schema(Schema.new_builder()
        .column("a", DataTypes.BIGINT())
        .column("b", DataTypes.STRING())
        .column("c", DataTypes.STRING())
        .build())
    .option("path", sink_path)
    .format(FormatDescriptor.for_format("csv")
        .option("field-delimiter", ",")
        .build())
    .build())

t.select("a + 1, b, c").execute_insert("csv_sink").wait()

with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')
Example #19
    def get_schema(self) -> Optional[Schema]:
        j_schema = self._j_table_descriptor.getSchema()
        if j_schema.isPresent():
            return Schema(j_schema.get())
        else:
            return None
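A minimal usage sketch for get_schema() (a hypothetical snippet, assuming the pyflink imports used in the examples above):

# get_schema() wraps the Java schema in a Python Schema when one was set
schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
descriptor = TableDescriptor.for_connector("datagen").schema(schema).build()
assert descriptor.get_schema() is not None
# a descriptor built without .schema(...) would yield None instead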
Example #20
import tempfile
import os
import shutil

from pyflink.shell import s_env, st_env, DataTypes
from pyflink.table.expressions import lit
from pyflink.table.schema import Schema
from pyflink.table.table_descriptor import TableDescriptor, FormatDescriptor

sink_path = tempfile.gettempdir() + '/batch.csv'
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)
s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')],
                         ['a', 'b', 'c'])

st_env.create_temporary_table(
    "csv_sink",
    TableDescriptor.for_connector("filesystem")
        .schema(Schema.new_builder()
                .column("a", DataTypes.BIGINT())
                .column("b", DataTypes.STRING())
                .column("c", DataTypes.STRING())
                .build())
        .option("path", sink_path)
        .format(FormatDescriptor.for_format("csv")
                .option("field-delimiter", ",")
                .build())
        .build())

t.select(t.a + lit(1), t.b, t.c).execute_insert("csv_sink").wait()

with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')