Esempi in Python per pyflink.table.schema.Schema

Esempio n. 1

0

Mostra file

    def test_from_and_to_data_stream_event_time(self):
        from pyflink.table import Schema

        ds = self.env.from_collection([(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
                                      Types.ROW_NAMED(
                                          ["a", "b", "c"],
                                          [Types.LONG(), Types.INT(), Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps()
            .with_timestamp_assigner(MyTimestampAssigner()))

        table = self.t_env.from_data_stream(ds,
                                            Schema.new_builder()
                                                  .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                                  .watermark("rowtime", "SOURCE_WATERMARK()")
                                                  .build())
        self.assertEqual("""(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
                         table._j_table.getResolvedSchema().toString())
        self.t_env.create_temporary_view("t",
                                         ds,
                                         Schema.new_builder()
                                         .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                         .watermark("rowtime", "SOURCE_WATERMARK()")
                                         .build())

        result = self.t_env.execute_sql("SELECT "
                                        "c, SUM(b) "
                                        "FROM t "
                                        "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [item for item in
                               map(str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)

        ds = self.t_env.to_data_stream(table)
        ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
        actual_results = self.test_sink.get_results(False)
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)

Esempio n. 2

0

Mostra file

    def test_table_from_descriptor(self):
        """
        Creates a table from a TableDescriptor and checks its schema and connector option.
        """
        from pyflink.table.schema import Schema

        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        descriptor = TableDescriptor.for_connector("fake").schema(schema).build()

        table = self.t_env.from_descriptor(descriptor)

        # rebuild a Schema from the table's resolved schema and compare with the declared one
        j_builder = Schema.new_builder()._j_builder
        j_schema = j_builder.fromResolvedSchema(table._j_table.getResolvedSchema()).build()
        self.assertEqual(schema, Schema(j_schema))

        j_resolved_table = table._j_table.getQueryOperation().getContextResolvedTable()
        j_options = j_resolved_table.getTable().getOptions()
        self.assertEqual("fake", j_options.get("connector"))

Esempio n. 3

0

Mostra file

    def test_from_and_to_changelog_stream_event_time(self):
        """
        Converts a timestamped changelog stream into a Table and back again.

        The table adds a "rowtime" metadata column, a computed column and a watermark; the
        reordered query result is written back as a changelog stream and aggregated in a
        windowed DataStream job whose output is compared with the expected sums.
        """
        from pyflink.table import Schema

        self.env.set_parallelism(1)
        ds = self.env.from_collection(
            [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
            Types.ROW([Types.LONG(), Types.INT(),
                       Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps(
            ).with_timestamp_assigner(MyTimestampAssigner()))

        # keep only the second and third fields of every input row
        changelog_stream = ds.map(lambda t: Row(t.f1, t.f2),
                                  Types.ROW([Types.INT(),
                                             Types.STRING()]))

        # derive physical columns and add a rowtime
        table = self.t_env.from_changelog_stream(
            changelog_stream,
            Schema.new_builder().column_by_metadata(
                "rowtime", DataTypes.TIMESTAMP_LTZ(3)).column_by_expression(
                    "computed", str(col("f1").upper_case)).watermark(
                        "rowtime", str(source_watermark())).build())

        self.t_env.create_temporary_view("t", table)

        # access and reorder columns
        reordered = self.t_env.sql_query("SELECT computed, rowtime, f0 FROM t")

        # write out the rowtime column with fully declared schema
        result = self.t_env.to_changelog_stream(
            reordered,
            Schema.new_builder().column(
                "f1", DataTypes.STRING()).column_by_metadata(
                    "rowtime",
                    DataTypes.TIMESTAMP_LTZ(3)).column_by_expression(
                        "ignored", str(col("f1").upper_case)).column(
                            "f0", DataTypes.INT()).build())

        # test event time window and field access
        result.key_by(lambda k: k.f1) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(A,47)', '(C,1000)', '(C,1000)']
        actual_results = self.test_sink.get_results(False)
        # sort both sides so the comparison does not depend on emission order
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)

Esempio n. 4

0

Mostra file

    def test_from_data_stream_with_schema(self):
        """
        Builds a Table from a DataStream with an explicit three-column Schema and checks
        that executing it returns the original rows.
        """
        from pyflink.table import Schema

        row_type = Types.ROW_NAMED(
            ["a", "b", "c"],
            [Types.INT(), Types.STRING(), Types.STRING()])
        ds = self.env.from_collection(
            [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
            type_info=row_type)

        schema = Schema.new_builder() \
            .column("a", DataTypes.INT()) \
            .column("b", DataTypes.STRING()) \
            .column("c", DataTypes.STRING()) \
            .build()
        table = self.t_env.from_data_stream(ds, schema)

        table_result = table.execute()
        with table_result.collect() as rows:
            # compare as sorted strings so the arrival order of rows does not matter
            collected = sorted(str(row) for row in rows)
            expected = sorted(map(str, [Row(1, 'Hi', 'Hello'), Row(2, 'Hello', 'Hi')]))
            self.assertEqual(expected, collected)

Esempio n. 5

0

Mostra file

File: test_table_descriptor.py Progetto: Eureka1996/flink

    def test_to_string(self):
        schema = Schema.new_builder().column("f0", DataTypes.STRING()).build()
        format_descriptor = FormatDescriptor \
            .for_format("test-format") \
            .option(self.option_a, False) \
            .build()
        table_descriptor = TableDescriptor.for_connector("test-connector") \
            .schema(schema) \
            .partitioned_by("f0") \
            .option(self.option_a, True) \
            .format(format_descriptor) \
            .comment("Test Comment") \
            .build()
        self.assertEqual("test-format[{a=false}]", str(format_descriptor))
        self.assertEqual(
            """(
  `f0` STRING
)
COMMENT 'Test Comment'
PARTITIONED BY (`f0`)
WITH (
  'a' = 'true',
  'connector' = 'test-connector',
  'test-format.a' = 'false',
  'format' = 'test-format'
)""", str(table_descriptor))

Esempio n. 6

0

Mostra file

    def test_create_temporary_table_from_descriptor(self):
        """
        Registers a temporary table from a TableDescriptor and checks that it is absent from
        the catalog but resolvable through the catalog manager with its schema and options.
        """
        from pyflink.table.schema import Schema

        t_env = self.t_env
        catalog_name = t_env.get_current_catalog()
        database_name = t_env.get_current_database()
        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        descriptor = TableDescriptor.for_connector("fake") \
            .schema(schema) \
            .option("a", "Test") \
            .build()
        t_env.create_temporary_table("T", descriptor)

        # the temporary table is expected not to be reported by the catalog
        table_path = ObjectPath(database_name, "T")
        self.assertFalse(t_env.get_catalog(catalog_name).table_exists(table_path))

        # look the table up through the Java catalog manager instead
        gateway = get_gateway()
        j_catalog_manager = t_env._j_tenv.getCatalogManager()
        j_identifier = gateway.jvm.ObjectIdentifier.of(catalog_name, database_name, "T")
        j_table = j_catalog_manager.getTable(j_identifier).get().getTable()
        catalog_table = CatalogBaseTable(j_table)

        self.assertEqual(schema, catalog_table.get_unresolved_schema())
        options = catalog_table.get_options()
        self.assertEqual("fake", options.get("connector"))
        self.assertEqual("Test", options.get("a"))

Esempio n. 7

0

Mostra file

File: test_shell_example.py Progetto: yangrong688/flink

    def test_stream_case(self):
        """
        Runs the streaming shell example: writes two rows to a csv filesystem sink under the
        temporary directory and checks the content of the produced file.
        """
        from pyflink.shell import s_env, st_env, DataTypes
        from pyflink.table.schema import Schema
        from pyflink.table.table_descriptor import TableDescriptor, FormatDescriptor
        # example begin

        import tempfile
        import os
        import shutil
        sink_path = tempfile.gettempdir() + '/streaming.csv'
        # remove a leftover file or directory at the sink location before writing
        if os.path.exists(sink_path):
            if os.path.isfile(sink_path):
                os.remove(sink_path)
            else:
                shutil.rmtree(sink_path)
        s_env.set_parallelism(1)
        t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')],
                                 ['a', 'b', 'c'])

        st_env.create_temporary_table(
            "stream_sink",
            TableDescriptor.for_connector("filesystem").schema(
                Schema.new_builder().column("a", DataTypes.BIGINT()).column(
                    "b", DataTypes.STRING()).column(
                        "c", DataTypes.STRING()).build()).option(
                            "path", sink_path).format(
                                FormatDescriptor.for_format("csv").option(
                                    "field-delimiter", ",").build()).build())

        t.select(t.a + 1, t.b, t.c).execute_insert("stream_sink").wait()

        # verification code; do not copy this part into shell.py
        # the sink path is read as a directory and its first listed file is checked
        with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
            lines = f.read()
            self.assertEqual(lines, '2,hi,hello\n' + '3,hi,hello\n')

Esempio n. 8

0

Mostra file

    def get_unresolved_schema(self) -> Schema:
        """
        Returns the unresolved schema of the table or view, wrapped as a Python
        :class:`Schema`.

        The schema can reference objects from other catalogs and will be resolved and validated by
        the framework when accessing the table or view.
        """
        j_unresolved_schema = self._j_catalog_base_table.getUnresolvedSchema()
        return Schema(j_unresolved_schema)

Esempio n. 9

0

Mostra file

File: test_table_environment_api.py Progetto: Eureka1996/flink

    def test_table_from_descriptor(self):
        """
        Creates a table from a TableDescriptor, then checks its schema and the connector
        option of the table found through the catalog manager.
        """
        from pyflink.table.schema import Schema

        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        descriptor = TableDescriptor.for_connector("fake").schema(schema).build()

        table = self.t_env.from_descriptor(descriptor)

        # rebuild a Schema from the table's resolved schema and compare with the declared one
        j_builder = Schema.new_builder()._j_builder
        j_schema = j_builder.fromResolvedSchema(table._j_table.getResolvedSchema()).build()
        self.assertEqual(schema, Schema(j_schema))

        # fetch the table by its identifier from the Java catalog manager
        j_catalog_manager = self.t_env._j_tenv.getCatalogManager()
        j_identifier = table._j_table.getQueryOperation().getTableIdentifier()
        catalog_table = CatalogBaseTable(
            j_catalog_manager.getTable(j_identifier).get().getTable())
        self.assertEqual("fake", catalog_table.get_options().get("connector"))

Esempio n. 10

0

Mostra file

File: test_table_descriptor.py Progetto: Eureka1996/flink

 def test_format_basic(self):
     """
     Checks that a format given by name yields exactly the "connector" and "format" options.
     """
     empty_schema = Schema.new_builder().build()
     descriptor = (
         TableDescriptor.for_connector("test-connector")
         .schema(empty_schema)
         .format("json")
         .build()
     )
     options = descriptor.get_options()
     self.assertEqual(2, len(options))
     self.assertEqual("test-connector", options.get("connector"))
     self.assertEqual("json", options.get("format"))

Esempio n. 11

0

Mostra file

File: test_table_descriptor.py Progetto: Eureka1996/flink

 def test_execute_insert_to_table_descriptor(self):
     """
     Inserts a "datagen" source table into a sink given directly as a "blackhole"
     TableDescriptor.
     """
     schema = Schema.new_builder().column("f0", DataTypes.STRING()).build()

     source_descriptor = TableDescriptor.for_connector("datagen") \
         .option("number-of-rows", '10') \
         .schema(schema) \
         .build()
     sink_descriptor = TableDescriptor.for_connector("blackhole") \
         .schema(schema) \
         .build()

     source_table = self.t_env.from_descriptor(source_descriptor)
     insert_result = source_table.execute_insert(sink_descriptor)
     insert_result.collect()

Esempio n. 12

0

Mostra file

File: test_table_descriptor.py Progetto: Eureka1996/flink

 def test_options(self):
     """
     Checks that options set on a TableDescriptor are returned as strings next to the
     "connector" option.
     """
     empty_schema = Schema.new_builder().build()
     descriptor = (
         TableDescriptor.for_connector("test-connector")
         .schema(empty_schema)
         .option(self.option_a, False)
         .option(self.option_b, 42)
         .option("c", "C")
         .build()
     )
     options = descriptor.get_options()
     self.assertEqual(4, len(options))
     self.assertEqual("test-connector", options.get("connector"))
     self.assertEqual("false", options.get("a"))
     self.assertEqual("42", options.get("b"))
     self.assertEqual("C", options.get("c"))

Esempio n. 13

0

Mostra file

    def test_schema_basic(self):
        old_schema = Schema.new_builder() \
            .from_row_data_type(DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())])) \
            .from_fields(["d", "e"], [DataTypes.STRING(), DataTypes.BOOLEAN()]) \
            .build()
        self.schema = Schema.new_builder() \
            .from_schema(old_schema) \
            .primary_key_named("primary_constraint", "id") \
            .column("id", DataTypes.INT().not_null()) \
            .column("counter", DataTypes.INT().not_null()) \
            .column("payload", "ROW<name STRING, age INT, flag BOOLEAN>") \
            .column_by_metadata("topic", DataTypes.STRING(), None, True) \
            .column_by_expression("ts", call_sql("orig_ts - INTERVAL '60' MINUTE")) \
            .column_by_metadata("orig_ts", DataTypes.TIMESTAMP(3), "timestamp") \
            .watermark("ts", "ts - INTERVAL '5' SECOND") \
            .column_by_expression("proctime", "PROCTIME()") \
            .build()
        self.assertEqual(
            """(
  `a` TINYINT,
  `b` SMALLINT,
  `c` INT,
  `d` STRING,
  `e` BOOLEAN,
  `id` INT NOT NULL,
  `counter` INT NOT NULL,
  `payload` [ROW<name STRING, age INT, flag BOOLEAN>],
  `topic` METADATA VIRTUAL,
  `ts` AS [orig_ts - INTERVAL '60' MINUTE],
  `orig_ts` METADATA FROM 'timestamp',
  `proctime` AS [PROCTIME()],
  WATERMARK FOR `ts` AS [ts - INTERVAL '5' SECOND],
  CONSTRAINT `primary_constraint` PRIMARY KEY (`id`) NOT ENFORCED
)""", str(self.schema))

Esempio n. 14

0

Mostra file

    def test_create_table_from_descriptor(self):
        """
        Creates a table from a TableDescriptor and checks that the catalog reports it with
        the declared schema and options.
        """
        from pyflink.table.schema import Schema

        catalog_name = self.t_env.get_current_catalog()
        database_name = self.t_env.get_current_database()
        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()
        descriptor = TableDescriptor.for_connector("fake") \
            .schema(schema) \
            .option("a", "Test") \
            .build()
        self.t_env.create_table("T", descriptor)

        table_path = ObjectPath(database_name, "T")
        self.assertTrue(self.t_env.get_catalog(catalog_name).table_exists(table_path))

        catalog_table = self.t_env.get_catalog(catalog_name).get_table(table_path)
        self.assertEqual(schema, catalog_table.get_unresolved_schema())
        options = catalog_table.get_options()
        self.assertEqual("fake", options.get("connector"))
        self.assertEqual("Test", options.get("a"))

Esempio n. 15

0

Mostra file

File: test_table_descriptor.py Progetto: Eureka1996/flink

    def test_statement_set_insert_using_table_descriptor(self):
        """
        Adds an insert into a "blackhole" TableDescriptor to a statement set, reading from
        a temporary "datagen" table, and waits for the execution to finish.
        """
        schema = Schema.new_builder().column("f0", DataTypes.INT()).build()

        source_descriptor = (
            TableDescriptor.for_connector("datagen")
            .schema(schema)
            .option("number-of-rows", '10')
            .build()
        )
        sink_descriptor = (
            TableDescriptor.for_connector("blackhole")
            .schema(schema)
            .build()
        )

        self.t_env.create_temporary_table("T", source_descriptor)

        statement_set = self.t_env.create_statement_set()
        source_table = self.t_env.from_path("T")
        statement_set.add_insert(sink_descriptor, source_table)

        statement_set.execute().wait()

Esempio n. 16

0

Mostra file

File: test_table_descriptor.py Progetto: Eureka1996/flink

 def test_format_with_format_descriptor(self):
     """
     Checks the options produced when a FormatDescriptor is attached together with
     ``self.key_format``: the test expects a "key.format" entry and the format's own
     options under the "key.test-format." prefix.
     """
     format_descriptor = (
         FormatDescriptor.for_format("test-format")
         .option(self.option_a, True)
         .option(self.option_b, 42)
         .option("c", "C")
         .build()
     )
     descriptor = (
         TableDescriptor.for_connector("test-connector")
         .schema(Schema.new_builder().build())
         .format(format_descriptor, self.key_format)
         .build()
     )
     options = descriptor.get_options()
     self.assertEqual(5, len(options))
     self.assertEqual("test-connector", options.get("connector"))
     self.assertEqual("test-format", options.get("key.format"))
     self.assertEqual("true", options.get("key.test-format.a"))
     self.assertEqual("42", options.get("key.test-format.b"))
     self.assertEqual("C", options.get("key.test-format.c"))

Esempio n. 17

0

Mostra file

File: test_table_descriptor.py Progetto: Eureka1996/flink

    def test_basic(self):
        """
        Builds a TableDescriptor with a schema, a partition key and a comment, and checks
        the values returned by its getters.
        """
        schema = (
            Schema.new_builder()
            .column("f0", DataTypes.STRING())
            .column("f1", DataTypes.BIGINT())
            .primary_key("f0")
            .build()
        )
        descriptor = (
            TableDescriptor.for_connector("test-connector")
            .schema(schema)
            .partitioned_by("f0")
            .comment("Test Comment")
            .build()
        )

        self.assertIsNotNone(descriptor.get_schema())

        partition_keys = descriptor.get_partition_keys()
        self.assertEqual(1, len(partition_keys))
        self.assertEqual("f0", partition_keys[0])

        options = descriptor.get_options()
        self.assertEqual(1, len(options))
        self.assertEqual("test-connector", options.get("connector"))

        self.assertEqual("Test Comment", descriptor.get_comment())

Esempio n. 18

0

Mostra file

# Writes two rows to a csv filesystem sink under the temporary directory and verifies
# the content of the produced file.
# NOTE(review): s_env, st_env, Schema, DataTypes, TableDescriptor and FormatDescriptor are
# not defined in this snippet; presumably they are imported earlier in the file - confirm.
import os
import shutil
import tempfile  # required by tempfile.gettempdir() below

sink_path = tempfile.gettempdir() + '/batch.csv'
# remove a leftover file or directory at the sink location before writing
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)
s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c'])

st_env.create_temporary_table("csv_sink", TableDescriptor.for_connector("filesystem")
    .schema(Schema.new_builder()
        .column("a", DataTypes.BIGINT())
        .column("b", DataTypes.STRING())
        .column("c", DataTypes.STRING())
        .build())
    .option("path", sink_path)
    .format(FormatDescriptor.for_format("csv")
        .option("field-delimiter", ",")
        .build())
    .build())

# use column expressions rather than a string expression, matching the other examples
t.select(t.a + 1, t.b, t.c).execute_insert("csv_sink").wait()

# the sink path is read as a directory and its first listed file is checked
with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')

Esempio n. 19

0

Mostra file

 def get_schema(self) -> Optional[Schema]:
     """
     Returns the schema of the underlying Java table descriptor wrapped as a Python
     :class:`Schema`, or None when the Java optional holds no schema.
     """
     j_optional_schema = self._j_table_descriptor.getSchema()
     return Schema(j_optional_schema.get()) if j_optional_schema.isPresent() else None

Esempio n. 20

0

Mostra file

# Writes two rows to a csv filesystem sink under the temporary directory and verifies
# the content of the produced file.
# NOTE(review): s_env, st_env, lit, Schema, DataTypes, TableDescriptor and FormatDescriptor
# are not defined in this snippet; presumably they are imported earlier in the file - confirm.
import os
import shutil
import tempfile  # required by tempfile.gettempdir() below

sink_path = tempfile.gettempdir() + '/batch.csv'
# remove a leftover file or directory at the sink location before writing
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)
s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')],
                         ['a', 'b', 'c'])

csv_format = FormatDescriptor.for_format("csv").option("field-delimiter", ",").build()
sink_schema = Schema.new_builder() \
    .column("a", DataTypes.BIGINT()) \
    .column("b", DataTypes.STRING()) \
    .column("c", DataTypes.STRING()) \
    .build()
st_env.create_temporary_table(
    "csv_sink",
    TableDescriptor.for_connector("filesystem")
    .schema(sink_schema)
    .option("path", sink_path)
    .format(csv_format)
    .build())

t.select(t.a + lit(1), t.b, t.c).execute_insert("csv_sink").wait()

# the sink path is read as a directory and its first listed file is checked
with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')