Example 1
 def get_accumulator_type(self):
     return DataTypes.ROW([
         DataTypes.FIELD(
             "f0", DataTypes.MAP_VIEW(DataTypes.STRING(),
                                      DataTypes.BIGINT())),
         DataTypes.FIELD("f1", DataTypes.BIGINT())
     ])
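This accumulator type pairs a MAP_VIEW field (kept in Flink state for large aggregations) with a plain BIGINT counter. For context, a minimal sketch of an AggregateFunction it could belong to; the count-distinct logic is an illustrative assumption, not taken from the original:

from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes
from pyflink.table.data_view import MapView


class CountDistinct(AggregateFunction):
    """Illustrative UDAF whose accumulator matches the type above."""

    def create_accumulator(self):
        # f0: state-backed map of seen keys, f1: distinct counter
        return Row(MapView(), 0)

    def accumulate(self, accumulator, value):
        if value is not None and value not in accumulator[0]:
            accumulator[0][value] = True
            accumulator[1] += 1

    def get_value(self, accumulator):
        return accumulator[1]

    def get_result_type(self):
        return DataTypes.BIGINT()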
Example 2
 def sql_type(cls):
     return DataTypes.ROW([
         DataTypes.FIELD("type", DataTypes.TINYINT()),
         DataTypes.FIELD("size", DataTypes.INT()),
         DataTypes.FIELD("indices", DataTypes.ARRAY(DataTypes.INT())),
         DataTypes.FIELD("values", DataTypes.ARRAY(DataTypes.DOUBLE())),
     ])
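Here sql_type() is the hook of PyFlink's UserDefinedType contract: it tells the planner how UDT values are laid out on the wire. A rough sketch of a surrounding class, assuming the pyflink.table.types.UserDefinedType API and a Spark-style 0 = sparse / 1 = dense tag convention (both are assumptions, not from the original):

from pyflink.table.types import DataTypes, UserDefinedType


class SparseVectorUDT(UserDefinedType):
    """Hypothetical UDT: a sparse vector serialized as the ROW above."""

    @classmethod
    def sql_type(cls):
        return DataTypes.ROW([
            DataTypes.FIELD("type", DataTypes.TINYINT()),
            DataTypes.FIELD("size", DataTypes.INT()),
            DataTypes.FIELD("indices", DataTypes.ARRAY(DataTypes.INT())),
            DataTypes.FIELD("values", DataTypes.ARRAY(DataTypes.DOUBLE())),
        ])

    def to_sql_type(self, vector):
        # 0 = sparse under the tag convention assumed here
        return 0, vector.size, list(vector.indices), list(vector.values)

    def from_sql_type(self, datum):
        return datum  # a real UDT would rebuild its Python value class here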
Example 3
 def get_result_type(self):
     return DataTypes.ROW([
         DataTypes.FIELD("f0", DataTypes.STRING()),
         DataTypes.FIELD("f1", DataTypes.STRING()),
         DataTypes.FIELD("f2", DataTypes.STRING()),
         DataTypes.FIELD("f3", DataTypes.BIGINT())
     ])
Example 4
    def test_from_element(self):
        t_env = self.t_env
        field_names = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r", "s"
        ]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(),
            DataTypes.INTERVAL(DataTypes.DAY(), DataTypes.SECOND()),
            DataTypes.ARRAY(DataTypes.DOUBLE()),
            DataTypes.ARRAY(DataTypes.DOUBLE(False)),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.DATE()),
            DataTypes.DECIMAL(10, 0),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.DOUBLE())
            ]),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
            DataTypes.BYTES(),
            ExamplePointUDT(),
            PythonOnlyUDT()
        ]
        schema = DataTypes.ROW(
            list(
                map(
                    lambda field_name, field_type: DataTypes.FIELD(
                        field_name, field_type), field_names, field_types)))
        table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
        t_env.register_table_sink("Results", table_sink)
        t = t_env.from_elements(
            [(1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
              datetime.time(1, 0, 0), datetime.datetime(
                  1970, 1, 2, 0, 0), datetime.datetime(1970, 1, 2, 0, 0),
              datetime.timedelta(days=1, microseconds=10), [1.0, None],
              array.array("d", [1.0, 2.0]), ["abc"],
              [datetime.date(1970, 1, 2)], Decimal(1), Row("a", "b")(1, 2.0), {
                  "key": 1.0
              }, bytearray(b'ABCD'), ExamplePoint(
                  1.0, 2.0), PythonOnlyPoint(3.0, 4.0))], schema)
        t.insert_into("Results")
        self.env.execute()
        actual = source_sink_utils.results()

        expected = [
            '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
            '1970-01-02 00:00:00.0,86400000010,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
            '1,1,2.0,{key=1.0},[65, 66, 67, 68],[1.0, 2.0],[3.0, 4.0]'
        ]
        self.assert_equals(actual, expected)
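The map-plus-lambda construction of schema above recurs throughout these examples; a list comprehension over zip builds the same ROW type and reads more directly:

schema = DataTypes.ROW([
    DataTypes.FIELD(name, data_type)
    for name, data_type in zip(field_names, field_types)
])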
Example 5
    def test_row_type_as_input_types_and_result_types(self):
        # test input_types and result_types are DataTypes.ROW
        a = udtf(lambda i: i,
                 input_types=DataTypes.ROW([DataTypes.FIELD("a", DataTypes.BIGINT())]),
                 result_types=DataTypes.ROW([DataTypes.FIELD("a", DataTypes.BIGINT())]))

        self.assertEqual(a._input_types,
                         [DataTypes.ROW([DataTypes.FIELD("a", DataTypes.BIGINT())])])
        self.assertEqual(a._result_types,
                         [DataTypes.ROW([DataTypes.FIELD("a", DataTypes.BIGINT())])])
Example 6
    def test_blink_from_element(self):
        t_env = BatchTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance(
            ).use_blink_planner().in_batch_mode().build())
        field_names = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q"
        ]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(3),
            DataTypes.INTERVAL(DataTypes.SECOND(3)),
            DataTypes.ARRAY(DataTypes.DOUBLE()),
            DataTypes.ARRAY(DataTypes.DOUBLE(False)),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.DATE()),
            DataTypes.DECIMAL(38, 18),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.DOUBLE())
            ]),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
            DataTypes.BYTES(),
            PythonOnlyUDT()
        ]
        schema = DataTypes.ROW(
            list(
                map(
                    lambda field_name, field_type: DataTypes.FIELD(
                        field_name, field_type), field_names, field_types)))
        table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
        t_env.register_table_sink("Results", table_sink)
        t = t_env.from_elements(
            [(1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
              datetime.time(1, 0, 0), datetime.datetime(1970, 1, 2, 0, 0),
              datetime.timedelta(days=1, microseconds=10), [1.0, None],
              array.array("d", [1.0, 2.0]), ["abc"],
              [datetime.date(1970, 1, 2)], Decimal(1), Row("a", "b")(1, 2.0), {
                  "key": 1.0
              }, bytearray(b'ABCD'), PythonOnlyPoint(3.0, 4.0))], schema)
        t.insert_into("Results")
        t_env.execute("test")
        actual = source_sink_utils.results()

        expected = [
            '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
            '86400000,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
            '1.000000000000000000,1,2.0,{key=1.0},[65, 66, 67, 68],[3.0, 4.0]'
        ]
        self.assert_equals(actual, expected)
Example 7
    def test_from_element(self):
        t_env = self.t_env
        a = array.array('b')
        a.frombytes(b'ABCD')  # array.fromstring() was removed in Python 3
        t = t_env.from_elements([
            (1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
             datetime.time(1, 0, 0), datetime.datetime(1970, 1, 2, 0,
                                                       0), [1.0, None],
             array.array("d",
                         [1.0, 2.0]), ["abc"], [datetime.date(1970, 1, 2)],
             Decimal(1), Row("a", "b")(1, 2.0), {
                 "key": 1.0
             }, a, ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0))
        ])
        field_names = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q"
        ]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(),
            DataTypes.ARRAY(DataTypes.DOUBLE()),
            DataTypes.ARRAY(DataTypes.DOUBLE(False)),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.DATE()),
            DataTypes.DECIMAL(),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.DOUBLE())
            ]),
            DataTypes.MAP(DataTypes.VARCHAR(), DataTypes.DOUBLE()),
            DataTypes.VARBINARY(),
            ExamplePointUDT(),
            PythonOnlyUDT()
        ]
        t_env.register_table_sink("Results", field_names, field_types,
                                  source_sink_utils.TestAppendSink())

        t.insert_into("Results")
        t_env.exec_env().execute()
        actual = source_sink_utils.results()

        expected = [
            '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,[1.0, null],'
            '[1.0, 2.0],[abc],[1970-01-02],1,1,2.0,{key=1.0},[65, 66, 67, 68],[1.0, 2.0],'
            '[3.0, 4.0]'
        ]
        self.assert_equals(actual, expected)
Example 8
import numpy as np
import pandas as pd

from pyflink.table import DataTypes, EnvironmentSettings, TableEnvironment


def conversion_from_dataframe():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    t_env.get_config().set("parallelism.default", "1")

    # create a PyFlink Table from a Pandas DataFrame
    pdf = pd.DataFrame(np.random.rand(1000, 2))
    table = t_env.from_pandas(pdf,
                              schema=DataTypes.ROW([
                                  DataTypes.FIELD("a", DataTypes.DOUBLE()),
                                  DataTypes.FIELD("b", DataTypes.DOUBLE())
                              ]))

    print(table.to_pandas())
Example 9
    def test_data_types_only_supported_in_blink_planner(self):
        timezone = self.t_env.get_config().get_local_timezone()
        local_datetime = pytz.timezone(timezone).localize(
            datetime.datetime(1970, 1, 1, 0, 0, 0, 123000))

        @udf(result_type=DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))
        def local_zoned_timestamp_func(local_zoned_timestamp_param):
            assert local_zoned_timestamp_param == local_datetime, \
                'local_zoned_timestamp_param is wrong value %s !' % local_zoned_timestamp_param
            return local_zoned_timestamp_param

        table_sink = source_sink_utils.TestAppendSink(
            ['a'], [DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3)])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements(
            [(local_datetime, )],
            DataTypes.ROW([
                DataTypes.FIELD("a",
                                DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))
            ]))

        exec_insert_table(
            t.select(
                local_zoned_timestamp_func(local_zoned_timestamp_func(t.a))),
            "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1970-01-01T00:00:00.123Z"])
Example 10
    def test_data_types_only_supported_in_blink_planner(self):
        import pandas as pd

        timezone = self.t_env.get_config().get_local_timezone()
        local_datetime = pytz.timezone(timezone).localize(
            datetime.datetime(1970, 1, 2, 0, 0, 0, 123000))

        @udf(result_type=DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3),
             func_type="pandas")
        def local_zoned_timestamp_func(local_zoned_timestamp_param):
            assert isinstance(local_zoned_timestamp_param, pd.Series)
            assert isinstance(local_zoned_timestamp_param[0], datetime.datetime), \
                'local_zoned_timestamp_param of wrong type %s !' % type(
                    local_zoned_timestamp_param[0])
            assert local_zoned_timestamp_param[0] == local_datetime, \
                'local_zoned_timestamp_param is wrong value %s, %s!' % \
                (local_zoned_timestamp_param[0], local_datetime)
            return local_zoned_timestamp_param

        table_sink = source_sink_utils.TestAppendSink(
            ['a'], [DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3)])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements(
            [(local_datetime, )],
            DataTypes.ROW([
                DataTypes.FIELD("a",
                                DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))
            ]))

        t.select(local_zoned_timestamp_func(local_zoned_timestamp_func(t.a))) \
            .execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[1970-01-02T00:00:00.123Z]"])
Example 11
    def test_data_types(self):
        import pandas as pd

        timezone = self.t_env.get_config().get_local_timezone()
        local_datetime = pytz.timezone(timezone).localize(
            datetime.datetime(1970, 1, 2, 0, 0, 0, 123000))

        @udf(result_type=DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3),
             func_type="pandas")
        def local_zoned_timestamp_func(local_zoned_timestamp_param):
            assert isinstance(local_zoned_timestamp_param, pd.Series)
            assert isinstance(local_zoned_timestamp_param[0], datetime.datetime), \
                'local_zoned_timestamp_param of wrong type %s !' % type(
                    local_zoned_timestamp_param[0])
            assert local_zoned_timestamp_param[0] == local_datetime, \
                'local_zoned_timestamp_param is wrong value %s, %s!' % \
                (local_zoned_timestamp_param[0], local_datetime)
            return local_zoned_timestamp_param

        sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP_LTZ(3)) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)

        t = self.t_env.from_elements(
            [(local_datetime, )],
            DataTypes.ROW([
                DataTypes.FIELD("a",
                                DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))
            ]))

        t.select(local_zoned_timestamp_func(local_zoned_timestamp_func(t.a))) \
            .execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[1970-01-02T00:00:00.123Z]"])
Example 12
    def test_from_element_expression(self):
        t_env = self.t_env

        field_names = ["a", "b", "c"]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.FLOAT()
        ]

        schema = DataTypes.ROW(
            list(
                map(
                    lambda field_name, field_type: DataTypes.FIELD(
                        field_name, field_type), field_names, field_types)))
        sink_table_ddl = """
            CREATE TABLE Results(a BIGINT, b STRING, c FLOAT)
            WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)

        t = t_env.from_elements(
            [row(1, 'abc', 2.0), row(2, 'def', 3.0)], schema)
        t.execute_insert("Results").wait()
        actual = source_sink_utils.results()

        expected = ['+I[1, abc, 2.0]', '+I[2, def, 3.0]']
        self.assert_equals(actual, expected)
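The row(...) values passed to from_elements here are expression helpers, not plain tuples; they come from the expressions module:

from pyflink.table.expressions import row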
Example 13
 def test_collect_null_value_result(self):
     element_data = [(1, None, 'a'),
                     (3, 4, 'b'),
                     (5, None, 'a'),
                     (7, 8, 'b')]
     source = self.t_env.from_elements(element_data,
                                       DataTypes.ROW([DataTypes.FIELD('a', DataTypes.INT()),
                                                      DataTypes.FIELD('b', DataTypes.INT()),
                                                      DataTypes.FIELD('c', DataTypes.STRING())]))
     table_result = source.execute()
     expected_result = [Row(1, None, 'a'), Row(3, 4, 'b'), Row(5, None, 'a'),
                        Row(7, 8, 'b')]
     with table_result.collect() as results:
         collected_result = []
         for result in results:
             collected_result.append(result)
         self.assertEqual(collected_result, expected_result)
Example 14
 def test_udt(self):
     self.t_env.from_elements([
         (DenseVector([1, 2, 3, 4]), 0., 1.),
         (DenseVector([2, 2, 3, 4]), 0., 2.),
         (DenseVector([3, 2, 3, 4]), 0., 3.),
         (DenseVector([4, 2, 3, 4]), 0., 4.),
         (DenseVector([5, 2, 3, 4]), 0., 5.),
         (DenseVector([11, 2, 3, 4]), 1., 1.),
         (DenseVector([12, 2, 3, 4]), 1., 2.),
         (DenseVector([13, 2, 3, 4]), 1., 3.),
         (DenseVector([14, 2, 3, 4]), 1., 4.),
         (DenseVector([15, 2, 3, 4]), 1., 5.),
     ],
         DataTypes.ROW([
             DataTypes.FIELD("features", VectorUDT()),
             DataTypes.FIELD("label", DataTypes.DOUBLE()),
             DataTypes.FIELD("weight", DataTypes.DOUBLE())]))
Example 15
def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("universal")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("eventTime", DataTypes.STRING()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("eventTime", DataTypes.STRING())) \
        .in_append_mode() \
        .create_temporary_table("source")
Example 16
def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())
            .rowtime(
            Rowtime()
                .timestamps_from_field("eventTime")
                .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
Example 17
def register_rides_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("rideTime", DataTypes.TIMESTAMP())
        ]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())) \
        .in_append_mode() \
        .register_table_sink("sink")
Example 18
def register_transactions_source(st_env):
    st_env.connect(Kafka()
                   .version("universal")
                   .topic("transactions-data")
                   .start_from_latest()
                   .property("zookeeper.connect", "host.docker.internal:2181")
                   .property("bootstrap.servers", "host.docker.internal:19091")) \
        .with_format(Json()
        .fail_on_missing_field(True)
        .schema(DataTypes.ROW([
        DataTypes.FIELD("customer", DataTypes.STRING()),
        DataTypes.FIELD("transaction_type", DataTypes.STRING()),
        DataTypes.FIELD("online_payment_amount", DataTypes.DOUBLE()),
        DataTypes.FIELD("in_store_payment_amount", DataTypes.DOUBLE()),
        DataTypes.FIELD("lat", DataTypes.DOUBLE()),
        DataTypes.FIELD("lon", DataTypes.DOUBLE()),
        DataTypes.FIELD("transaction_datetime", DataTypes.TIMESTAMP())]))) \
        .with_schema(Schema()
        .field("customer", DataTypes.STRING())
        .field("transaction_type", DataTypes.STRING())
        .field("online_payment_amount", DataTypes.DOUBLE())
        .field("in_store_payment_amount", DataTypes.DOUBLE())
        .field("lat", DataTypes.DOUBLE())
        .field("lon", DataTypes.DOUBLE())
        .field("rowtime", DataTypes.TIMESTAMP())
        .rowtime(
        Rowtime()
            .timestamps_from_field("transaction_datetime")
            .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
Example 19
 def setUp(self):
     super(BucketizerTest, self).setUp()
     self.input_table = self.t_env.from_elements(
         [(1, -0.5, 0.0, 1.0), (2, float('-inf'), 1.0, float('inf')),
          (3, float('nan'), -0.5, -0.5)],
         DataTypes.ROW([
             DataTypes.FIELD("id", DataTypes.INT()),
             DataTypes.FIELD("f1", DataTypes.DOUBLE()),
             DataTypes.FIELD("f2", DataTypes.DOUBLE()),
             DataTypes.FIELD("f3", DataTypes.DOUBLE())
         ]))
     self.splits_array = ((-0.5, 0.0, 0.5), (-1.0, 0.0, 2.0),
                          (float('-inf'), 10.0, float('inf')))
     self.expected_keep_result = [
         Row(1, 0, 1, 0), Row(2, 2, 1, 1),
         Row(3, 2, 0, 0)
     ]
     self.expected_skip_result = [Row(1, 0, 1, 0)]
Example 20
 def get_accumulator_type(self):
   return DataTypes.ROW([
       DataTypes.FIELD("available", DataTypes.BOOLEAN()),
       DataTypes.FIELD("timestamp", DataTypes.BIGINT()),
       DataTypes.FIELD("batch_num", DataTypes.BIGINT()),
       DataTypes.FIELD("sold_last", DataTypes.LIST_VIEW(DataTypes.FLOAT())),
       DataTypes.FIELD("price_last", DataTypes.LIST_VIEW(DataTypes.FLOAT())),
       DataTypes.FIELD("review_total_last", DataTypes.LIST_VIEW(DataTypes.INT())),
       DataTypes.FIELD("info", DataTypes.STRING())])
Example 21
def register_ride_duration_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("durationMin", DataTypes.BIGINT())
        ]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("durationMin", DataTypes.BIGINT())) \
        .in_append_mode() \
        .register_table_sink("TempResults")
Example 22
from pyflink.common import Row
from pyflink.table import (DataTypes, EnvironmentSettings, FormatDescriptor,
                           Schema, TableDescriptor, TableEnvironment)
from pyflink.table.expressions import col, lit
from pyflink.table.udf import udtf

# word_count_data (the default list of input lines) is assumed defined elsewhere.


def word_count(input_path, output_path):
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    # write all the data to one file
    t_env.get_config().get_configuration().set_string("parallelism.default",
                                                      "1")

    # define the source
    if input_path is not None:
        t_env.create_temporary_table(
            'source',
            TableDescriptor.for_connector('filesystem').schema(
                Schema.new_builder().column(
                    'word', DataTypes.STRING()).build()).option(
                        'path', input_path).format('csv').build())
        tab = t_env.from_path('source')
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        tab = t_env.from_elements(
            map(lambda i: (i, ), word_count_data),
            DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())]))

    # define the sink
    if output_path is not None:
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('filesystem').schema(
                Schema.new_builder().column('word', DataTypes.STRING()).column(
                    'count',
                    DataTypes.BIGINT()).build()).option('path', output_path).
            format(FormatDescriptor.for_format('canal-json').build()).build())
    else:
        print(
            "Printing result to stdout. Use --output to specify output path.")
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('print').schema(
                Schema.new_builder().column('word', DataTypes.STRING()).column(
                    'count', DataTypes.BIGINT()).build()).build())

    @udtf(result_types=[DataTypes.STRING()])
    def split(line: Row):
        for s in line[0].split():
            yield Row(s)

    # compute word count
    tab.flat_map(split).alias('word') \
       .group_by(col('word')) \
       .select(col('word'), lit(1).count) \
       .execute_insert('sink') \
       .wait()
Example 23
def register_transactions_source(st_env):
    st_env.connect(Kafka()
                   .version("universal")
                   .topic("server-logs")
                   .start_from_earliest()
                   .property("zookeeper.connect", "localhost:2181")
                   .property("bootstrap.servers", "localhost:9092")) \
        .with_format(Json()
        .fail_on_missing_field(True)
        .schema(DataTypes.ROW([
        DataTypes.FIELD("event_id", DataTypes.STRING()),
        DataTypes.FIELD("account_id", DataTypes.DOUBLE()),
        DataTypes.FIELD("event_type", DataTypes.DOUBLE()),
        DataTypes.FIELD("location_country", DataTypes.DOUBLE()),
        DataTypes.FIELD("event_timestamp", DataTypes.TIMESTAMP(precision=3))]))) \
        .with_schema(Schema()
        .field("event_id", DataTypes.STRING())
        .field("account_id", DataTypes.DOUBLE())
        .field("event_type", DataTypes.STRING())
        .field("location_country", DataTypes.STRING())
        .field("event_timestamp", DataTypes.TIMESTAMP(precision=3))) \
        .in_append_mode() \
        .create_temporary_table("source")
Example 24
    def test_schema_basic(self):
        old_schema = Schema.new_builder() \
            .from_row_data_type(DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())])) \
            .from_fields(["d", "e"], [DataTypes.STRING(), DataTypes.BOOLEAN()]) \
            .build()
        self.schema = Schema.new_builder() \
            .from_schema(old_schema) \
            .primary_key_named("primary_constraint", "id") \
            .column("id", DataTypes.INT().not_null()) \
            .column("counter", DataTypes.INT().not_null()) \
            .column("payload", "ROW<name STRING, age INT, flag BOOLEAN>") \
            .column_by_metadata("topic", DataTypes.STRING(), None, True) \
            .column_by_expression("ts", call_sql("orig_ts - INTERVAL '60' MINUTE")) \
            .column_by_metadata("orig_ts", DataTypes.TIMESTAMP(3), "timestamp") \
            .watermark("ts", "ts - INTERVAL '5' SECOND") \
            .column_by_expression("proctime", "PROCTIME()") \
            .build()
        self.assertEqual(
            """(
  `a` TINYINT,
  `b` SMALLINT,
  `c` INT,
  `d` STRING,
  `e` BOOLEAN,
  `id` INT NOT NULL,
  `counter` INT NOT NULL,
  `payload` [ROW<name STRING, age INT, flag BOOLEAN>],
  `topic` METADATA VIRTUAL,
  `ts` AS [orig_ts - INTERVAL '60' MINUTE],
  `orig_ts` METADATA FROM 'timestamp',
  `proctime` AS [PROCTIME()],
  WATERMARK FOR `ts` AS [ts - INTERVAL '5' SECOND],
  CONSTRAINT `primary_constraint` PRIMARY KEY (`id`) NOT ENFORCED
)""", str(self.schema))
Example 25
    def test_from_element_expression(self):
        t_env = self.t_env

        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.FLOAT()]

        schema = DataTypes.ROW(
            list(map(lambda field_name, field_type: DataTypes.FIELD(field_name, field_type),
                     field_names,
                     field_types)))
        table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
        t_env.register_table_sink("Results", table_sink)
        t = t_env.from_elements([row(1, 'abc', 2.0), row(2, 'def', 3.0)], schema)
        t.execute_insert("Results").wait()
        actual = source_sink_utils.results()

        expected = ['+I[1, abc, 2.0]', '+I[2, def, 3.0]']
        self.assert_equals(actual, expected)
Example 26
    def test_all_data_types(self):
        import pandas as pd
        import numpy as np

        @udf(result_type=DataTypes.TINYINT(), func_type="pandas")
        def tinyint_func(tinyint_param):
            assert isinstance(tinyint_param, pd.Series)
            assert isinstance(tinyint_param[0], np.int8), \
                'tinyint_param of wrong type %s !' % type(tinyint_param[0])
            return tinyint_param

        @udf(result_type=DataTypes.SMALLINT(), func_type="pandas")
        def smallint_func(smallint_param):
            assert isinstance(smallint_param, pd.Series)
            assert isinstance(smallint_param[0], np.int16), \
                'smallint_param of wrong type %s !' % type(smallint_param[0])
            assert smallint_param[
                0] == 32767, 'smallint_param of wrong value %s' % smallint_param
            return smallint_param

        @udf(result_type=DataTypes.INT(), func_type="pandas")
        def int_func(int_param):
            assert isinstance(int_param, pd.Series)
            assert isinstance(int_param[0], np.int32), \
                'int_param of wrong type %s !' % type(int_param[0])
            assert int_param[
                0] == -2147483648, 'int_param of wrong value %s' % int_param
            return int_param

        @udf(result_type=DataTypes.BIGINT(), func_type="pandas")
        def bigint_func(bigint_param):
            assert isinstance(bigint_param, pd.Series)
            assert isinstance(bigint_param[0], np.int64), \
                'bigint_param of wrong type %s !' % type(bigint_param[0])
            return bigint_param

        @udf(result_type=DataTypes.BOOLEAN(), func_type="pandas")
        def boolean_func(boolean_param):
            assert isinstance(boolean_param, pd.Series)
            assert isinstance(boolean_param[0], np.bool_), \
                'boolean_param of wrong type %s !' % type(boolean_param[0])
            return boolean_param

        @udf(result_type=DataTypes.FLOAT(), func_type="pandas")
        def float_func(float_param):
            assert isinstance(float_param, pd.Series)
            assert isinstance(float_param[0], np.float32), \
                'float_param of wrong type %s !' % type(float_param[0])
            return float_param

        @udf(result_type=DataTypes.DOUBLE(), func_type="pandas")
        def double_func(double_param):
            assert isinstance(double_param, pd.Series)
            assert isinstance(double_param[0], np.float64), \
                'double_param of wrong type %s !' % type(double_param[0])
            return double_param

        @udf(result_type=DataTypes.STRING(), func_type="pandas")
        def varchar_func(varchar_param):
            assert isinstance(varchar_param, pd.Series)
            assert isinstance(varchar_param[0], str), \
                'varchar_param of wrong type %s !' % type(varchar_param[0])
            return varchar_param

        @udf(result_type=DataTypes.BYTES(), func_type="pandas")
        def varbinary_func(varbinary_param):
            assert isinstance(varbinary_param, pd.Series)
            assert isinstance(varbinary_param[0], bytes), \
                'varbinary_param of wrong type %s !' % type(varbinary_param[0])
            return varbinary_param

        @udf(result_type=DataTypes.DECIMAL(38, 18), func_type="pandas")
        def decimal_func(decimal_param):
            assert isinstance(decimal_param, pd.Series)
            assert isinstance(decimal_param[0], decimal.Decimal), \
                'decimal_param of wrong type %s !' % type(decimal_param[0])
            return decimal_param

        @udf(result_type=DataTypes.DATE(), func_type="pandas")
        def date_func(date_param):
            assert isinstance(date_param, pd.Series)
            assert isinstance(date_param[0], datetime.date), \
                'date_param of wrong type %s !' % type(date_param[0])
            return date_param

        @udf(result_type=DataTypes.TIME(), func_type="pandas")
        def time_func(time_param):
            assert isinstance(time_param, pd.Series)
            assert isinstance(time_param[0], datetime.time), \
                'time_param of wrong type %s !' % type(time_param[0])
            return time_param

        timestamp_value = datetime.datetime(1970, 1, 2, 0, 0, 0, 123000)

        @udf(result_type=DataTypes.TIMESTAMP(3), func_type="pandas")
        def timestamp_func(timestamp_param):
            assert isinstance(timestamp_param, pd.Series)
            assert isinstance(timestamp_param[0], datetime.datetime), \
                'timestamp_param of wrong type %s !' % type(timestamp_param[0])
            assert timestamp_param[0] == timestamp_value, \
                'timestamp_param is wrong value %s, should be %s!' % (timestamp_param[0],
                                                                      timestamp_value)
            return timestamp_param

        def array_func(array_param):
            assert isinstance(array_param, pd.Series)
            assert isinstance(array_param[0], np.ndarray), \
                'array_param of wrong type %s !' % type(array_param[0])
            return array_param

        array_str_func = udf(array_func,
                             result_type=DataTypes.ARRAY(DataTypes.STRING()),
                             func_type="pandas")

        array_timestamp_func = udf(array_func,
                                   result_type=DataTypes.ARRAY(
                                       DataTypes.TIMESTAMP(3)),
                                   func_type="pandas")

        array_int_func = udf(array_func,
                             result_type=DataTypes.ARRAY(DataTypes.INT()),
                             func_type="pandas")

        @udf(result_type=DataTypes.ARRAY(DataTypes.STRING()),
             func_type="pandas")
        def nested_array_func(nested_array_param):
            assert isinstance(nested_array_param, pd.Series)
            assert isinstance(nested_array_param[0], np.ndarray), \
                'nested_array_param of wrong type %s !' % type(nested_array_param[0])
            return pd.Series(nested_array_param[0])

        row_type = DataTypes.ROW([
            DataTypes.FIELD("f1", DataTypes.INT()),
            DataTypes.FIELD("f2", DataTypes.STRING()),
            DataTypes.FIELD("f3", DataTypes.TIMESTAMP(3)),
            DataTypes.FIELD("f4", DataTypes.ARRAY(DataTypes.INT()))
        ])

        @udf(result_type=row_type, func_type="pandas")
        def row_func(row_param):
            assert isinstance(row_param, pd.DataFrame)
            assert isinstance(row_param.f1, pd.Series)
            assert isinstance(row_param.f1[0], np.int32), \
                'row_param.f1 of wrong type %s !' % type(row_param.f1[0])
            assert isinstance(row_param.f2, pd.Series)
            assert isinstance(row_param.f2[0], str), \
                'row_param.f2 of wrong type %s !' % type(row_param.f2[0])
            assert isinstance(row_param.f3, pd.Series)
            assert isinstance(row_param.f3[0], datetime.datetime), \
                'row_param.f3 of wrong type %s !' % type(row_param.f3[0])
            assert isinstance(row_param.f4, pd.Series)
            assert isinstance(row_param.f4[0], np.ndarray), \
                'row_param.f4 of wrong type %s !' % type(row_param.f4[0])
            return row_param

        table_sink = source_sink_utils.TestAppendSink([
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ], [
            DataTypes.TINYINT(),
            DataTypes.SMALLINT(),
            DataTypes.INT(),
            DataTypes.BIGINT(),
            DataTypes.BOOLEAN(),
            DataTypes.BOOLEAN(),
            DataTypes.FLOAT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.BYTES(),
            DataTypes.DECIMAL(38, 18),
            DataTypes.DECIMAL(38, 18),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(3),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.TIMESTAMP(3)),
            DataTypes.ARRAY(DataTypes.INT()),
            DataTypes.ARRAY(DataTypes.STRING()), row_type
        ])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements(
            [(1, 32767, -2147483648, 1, True, False, 1.0, 1.0, 'hello', '中文',
              bytearray(b'flink'), decimal.Decimal('1000000000000000000.05'),
              decimal.Decimal(
                  '1000000000000000000.05999999999999999899999999999'),
              datetime.date(2014, 9, 13),
              datetime.time(hour=1, minute=0, second=1), timestamp_value,
              ['hello', '中文', None], [timestamp_value], [1, 2], [[
                  'hello', '中文', None
              ]], Row(1, 'hello', timestamp_value, [1, 2]))],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD("b", DataTypes.SMALLINT()),
                DataTypes.FIELD("c", DataTypes.INT()),
                DataTypes.FIELD("d", DataTypes.BIGINT()),
                DataTypes.FIELD("e", DataTypes.BOOLEAN()),
                DataTypes.FIELD("f", DataTypes.BOOLEAN()),
                DataTypes.FIELD("g", DataTypes.FLOAT()),
                DataTypes.FIELD("h", DataTypes.DOUBLE()),
                DataTypes.FIELD("i", DataTypes.STRING()),
                DataTypes.FIELD("j", DataTypes.STRING()),
                DataTypes.FIELD("k", DataTypes.BYTES()),
                DataTypes.FIELD("l", DataTypes.DECIMAL(38, 18)),
                DataTypes.FIELD("m", DataTypes.DECIMAL(38, 18)),
                DataTypes.FIELD("n", DataTypes.DATE()),
                DataTypes.FIELD("o", DataTypes.TIME()),
                DataTypes.FIELD("p", DataTypes.TIMESTAMP(3)),
                DataTypes.FIELD("q", DataTypes.ARRAY(DataTypes.STRING())),
                DataTypes.FIELD("r", DataTypes.ARRAY(DataTypes.TIMESTAMP(3))),
                DataTypes.FIELD("s", DataTypes.ARRAY(DataTypes.INT())),
                DataTypes.FIELD(
                    "t", DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))),
                DataTypes.FIELD("u", row_type)
            ]))

        t.select(
            tinyint_func(t.a),
            smallint_func(t.b),
            int_func(t.c),
            bigint_func(t.d),
            boolean_func(t.e),
            boolean_func(t.f),
            float_func(t.g),
            double_func(t.h),
            varchar_func(t.i),
            varchar_func(t.j),
            varbinary_func(t.k),
            decimal_func(t.l),
            decimal_func(t.m),
            date_func(t.n),
            time_func(t.o),
            timestamp_func(t.p),
            array_str_func(t.q),
            array_timestamp_func(t.r),
            array_int_func(t.s),
            nested_array_func(t.t),
            row_func(t.u)) \
            .execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "+I[1, 32767, -2147483648, 1, true, false, 1.0, 1.0, hello, 中文, "
            "[102, 108, 105, 110, 107], 1000000000000000000.050000000000000000, "
            "1000000000000000000.059999999999999999, 2014-09-13, 01:00:01, "
            "1970-01-02 00:00:00.123, [hello, 中文, null], [1970-01-02 00:00:00.123], "
            "[1, 2], [hello, 中文, null], +I[1, hello, 1970-01-02 00:00:00.123, [1, 2]]]"
        ])
Example 27
 def test_collect_for_all_data_types(self):
     expected_result = [
         Row(1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
             bytearray(b'pyflink'), 'pyflink', datetime.date(2014, 9, 13),
             datetime.time(12, 0, 0, 123000),
             datetime.datetime(2018, 3, 11, 3, 0, 0, 123000),
             [Row(['[pyflink]']),
              Row(['[pyflink]']),
              Row(['[pyflink]'])], {
                  1: Row(['[flink]']),
                  2: Row(['[pyflink]'])
              }, decimal.Decimal('1000000000000000000.050000000000000000'),
             decimal.Decimal('1000000000000000000.059999999999999999'))
     ]
     source = self.t_env.from_elements(
         [(1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
           bytearray(b'pyflink'), 'pyflink', datetime.date(2014, 9, 13),
           datetime.time(hour=12, minute=0, second=0, microsecond=123000),
           datetime.datetime(2018, 3, 11, 3, 0, 0, 123000),
           [Row(['pyflink']),
            Row(['pyflink']),
            Row(['pyflink'])], {
                1: Row(['flink']),
                2: Row(['pyflink'])
            }, decimal.Decimal('1000000000000000000.05'),
           decimal.Decimal(
               '1000000000000000000.05999999999999999899999999999'))],
         DataTypes.ROW([
             DataTypes.FIELD("a", DataTypes.BIGINT()),
             DataTypes.FIELD("b", DataTypes.BIGINT()),
             DataTypes.FIELD("c", DataTypes.TINYINT()),
             DataTypes.FIELD("d", DataTypes.BOOLEAN()),
             DataTypes.FIELD("e", DataTypes.SMALLINT()),
             DataTypes.FIELD("f", DataTypes.INT()),
             DataTypes.FIELD("g", DataTypes.FLOAT()),
             DataTypes.FIELD("h", DataTypes.DOUBLE()),
             DataTypes.FIELD("i", DataTypes.BYTES()),
             DataTypes.FIELD("j", DataTypes.STRING()),
             DataTypes.FIELD("k", DataTypes.DATE()),
             DataTypes.FIELD("l", DataTypes.TIME()),
             DataTypes.FIELD("m", DataTypes.TIMESTAMP(3)),
             DataTypes.FIELD(
                 "n",
                 DataTypes.ARRAY(
                     DataTypes.ROW(
                         [DataTypes.FIELD('ss2', DataTypes.STRING())]))),
             DataTypes.FIELD(
                 "o",
                 DataTypes.MAP(
                     DataTypes.BIGINT(),
                     DataTypes.ROW(
                         [DataTypes.FIELD('ss', DataTypes.STRING())]))),
             DataTypes.FIELD("p", DataTypes.DECIMAL(38, 18)),
             DataTypes.FIELD("q", DataTypes.DECIMAL(38, 18))
         ]))
     table_result = source.execute()
     with table_result.collect() as result:
         collected_result = []
         for i in result:
             collected_result.append(i)
         self.assertEqual(expected_result, collected_result)
Example 28
from pyflink.common import Row
from pyflink.table import AggregateFunction, DataTypes
from pyflink.table.udf import udaf


class WeightedAvg(AggregateFunction):
    def create_accumulator(self):
        # Row(sum, count)
        return Row(0, 0)

    def get_value(self, accumulator: Row) -> float:
        if accumulator[1] == 0:
            return 0
        else:
            return accumulator[0] / accumulator[1]

    def accumulate(self, accumulator: Row, value, weight):
        accumulator[0] += value * weight
        accumulator[1] += weight

    def retract(self, accumulator: Row, value, weight):
        accumulator[0] -= value * weight
        accumulator[1] -= weight


weighted_avg = udaf(f=WeightedAvg(),
                    result_type=DataTypes.DOUBLE(),
                    accumulator_type=DataTypes.ROW([
                        DataTypes.FIELD("f0", DataTypes.BIGINT()),
                        DataTypes.FIELD("f1", DataTypes.BIGINT())
                    ]))
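Once declared, weighted_avg can be used like a built-in aggregate; a usage sketch assuming a table with value, count and name columns (the sample rows are made up):

t = t_env.from_elements(
    [(1, 2, "Lee"), (3, 4, "Jay"), (5, 6, "Jay"), (7, 8, "Lee")],
    ["value", "count", "name"])
result = t.group_by(t.name) \
          .select(weighted_avg(t.value, t.count).alias("avg"))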
Example 29
    def test_all_data_types(self):
        def boolean_func(bool_param):
            assert isinstance(bool_param, bool), 'bool_param of wrong type %s !' \
                                                 % type(bool_param)
            return bool_param

        def tinyint_func(tinyint_param):
            assert isinstance(tinyint_param, int), 'tinyint_param of wrong type %s !' \
                                                   % type(tinyint_param)
            return tinyint_param

        def smallint_func(smallint_param):
            assert isinstance(smallint_param, int), 'smallint_param of wrong type %s !' \
                                                    % type(smallint_param)
            assert smallint_param == 32767, 'smallint_param of wrong value %s' % smallint_param
            return smallint_param

        def int_func(int_param):
            assert isinstance(int_param, int), 'int_param of wrong type %s !' \
                                               % type(int_param)
            assert int_param == -2147483648, 'int_param of wrong value %s' % int_param
            return int_param

        def bigint_func(bigint_param):
            assert isinstance(bigint_param, int), 'bigint_param of wrong type %s !' \
                                                  % type(bigint_param)
            return bigint_param

        def bigint_func_none(bigint_param):
            assert bigint_param is None, 'bigint_param %s should be None!' % bigint_param
            return bigint_param

        def float_func(float_param):
            assert isinstance(float_param, float) and float_equal(float_param, 1.23, 1e-6), \
                'float_param is wrong value %s !' % float_param
            return float_param

        def double_func(double_param):
            assert isinstance(double_param, float) and float_equal(double_param, 1.98932, 1e-7), \
                'double_param is wrong value %s !' % double_param
            return double_param

        def bytes_func(bytes_param):
            assert bytes_param == b'flink', \
                'bytes_param is wrong value %s !' % bytes_param
            return bytes_param

        def str_func(str_param):
            assert str_param == 'pyflink', \
                'str_param is wrong value %s !' % str_param
            return str_param

        def date_func(date_param):
            from datetime import date
            assert date_param == date(year=2014, month=9, day=13), \
                'date_param is wrong value %s !' % date_param
            return date_param

        def time_func(time_param):
            from datetime import time
            assert time_param == time(hour=12, minute=0, second=0, microsecond=123000), \
                'time_param is wrong value %s !' % time_param
            return time_param

        def timestamp_func(timestamp_param):
            from datetime import datetime
            assert timestamp_param == datetime(2018, 3, 11, 3, 0, 0, 123000), \
                'timestamp_param is wrong value %s !' % timestamp_param
            return timestamp_param

        def array_func(array_param):
            assert array_param == [[1, 2, 3]], \
                'array_param is wrong value %s !' % array_param
            return array_param[0]

        def map_func(map_param):
            assert map_param == {1: 'flink', 2: 'pyflink'}, \
                'map_param is wrong value %s !' % map_param
            return map_param

        def decimal_func(decimal_param):
            from decimal import Decimal
            assert decimal_param == Decimal('1000000000000000000.050000000000000000'), \
                'decimal_param is wrong value %s !' % decimal_param
            return decimal_param

        def decimal_cut_func(decimal_param):
            from decimal import Decimal
            assert decimal_param == Decimal('1000000000000000000.059999999999999999'), \
                'decimal_param is wrong value %s !' % decimal_param
            return decimal_param

        self.t_env.create_temporary_system_function(
            "boolean_func", udf(boolean_func, result_type=DataTypes.BOOLEAN()))

        self.t_env.create_temporary_system_function(
            "tinyint_func", udf(tinyint_func, result_type=DataTypes.TINYINT()))

        self.t_env.create_temporary_system_function(
            "smallint_func",
            udf(smallint_func, result_type=DataTypes.SMALLINT()))

        self.t_env.create_temporary_system_function(
            "int_func", udf(int_func, result_type=DataTypes.INT()))

        self.t_env.create_temporary_system_function(
            "bigint_func", udf(bigint_func, result_type=DataTypes.BIGINT()))

        self.t_env.create_temporary_system_function(
            "bigint_func_none",
            udf(bigint_func_none, result_type=DataTypes.BIGINT()))

        self.t_env.create_temporary_system_function(
            "float_func", udf(float_func, result_type=DataTypes.FLOAT()))

        self.t_env.create_temporary_system_function(
            "double_func", udf(double_func, result_type=DataTypes.DOUBLE()))

        self.t_env.create_temporary_system_function(
            "bytes_func", udf(bytes_func, result_type=DataTypes.BYTES()))

        self.t_env.create_temporary_system_function(
            "str_func", udf(str_func, result_type=DataTypes.STRING()))

        self.t_env.create_temporary_system_function(
            "date_func", udf(date_func, result_type=DataTypes.DATE()))

        self.t_env.create_temporary_system_function(
            "time_func", udf(time_func, result_type=DataTypes.TIME()))

        self.t_env.create_temporary_system_function(
            "timestamp_func",
            udf(timestamp_func, result_type=DataTypes.TIMESTAMP(3)))

        self.t_env.create_temporary_system_function(
            "array_func",
            udf(array_func, result_type=DataTypes.ARRAY(DataTypes.BIGINT())))

        self.t_env.create_temporary_system_function(
            "map_func",
            udf(map_func,
                result_type=DataTypes.MAP(DataTypes.BIGINT(),
                                          DataTypes.STRING())))

        self.t_env.register_function(
            "decimal_func",
            udf(decimal_func, result_type=DataTypes.DECIMAL(38, 18)))

        self.t_env.register_function(
            "decimal_cut_func",
            udf(decimal_cut_func, result_type=DataTypes.DECIMAL(38, 18)))

        table_sink = source_sink_utils.TestAppendSink([
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q'
        ], [
            DataTypes.BIGINT(),
            DataTypes.BIGINT(),
            DataTypes.TINYINT(),
            DataTypes.BOOLEAN(),
            DataTypes.SMALLINT(),
            DataTypes.INT(),
            DataTypes.FLOAT(),
            DataTypes.DOUBLE(),
            DataTypes.BYTES(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(3),
            DataTypes.ARRAY(DataTypes.BIGINT()),
            DataTypes.MAP(DataTypes.BIGINT(), DataTypes.STRING()),
            DataTypes.DECIMAL(38, 18),
            DataTypes.DECIMAL(38, 18)
        ])
        self.t_env.register_table_sink("Results", table_sink)

        import datetime
        import decimal
        t = self.t_env.from_elements(
            [(1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
              bytearray(b'flink'), 'pyflink', datetime.date(2014, 9, 13),
              datetime.time(hour=12, minute=0, second=0, microsecond=123000),
              datetime.datetime(2018, 3, 11, 3, 0, 0, 123000), [[1, 2, 3]], {
                  1: 'flink',
                  2: 'pyflink'
              }, decimal.Decimal('1000000000000000000.05'),
              decimal.Decimal(
                  '1000000000000000000.05999999999999999899999999999'))],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.BIGINT()),
                DataTypes.FIELD("c", DataTypes.TINYINT()),
                DataTypes.FIELD("d", DataTypes.BOOLEAN()),
                DataTypes.FIELD("e", DataTypes.SMALLINT()),
                DataTypes.FIELD("f", DataTypes.INT()),
                DataTypes.FIELD("g", DataTypes.FLOAT()),
                DataTypes.FIELD("h", DataTypes.DOUBLE()),
                DataTypes.FIELD("i", DataTypes.BYTES()),
                DataTypes.FIELD("j", DataTypes.STRING()),
                DataTypes.FIELD("k", DataTypes.DATE()),
                DataTypes.FIELD("l", DataTypes.TIME()),
                DataTypes.FIELD("m", DataTypes.TIMESTAMP(3)),
                DataTypes.FIELD(
                    "n", DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT()))),
                DataTypes.FIELD(
                    "o", DataTypes.MAP(DataTypes.BIGINT(),
                                       DataTypes.STRING())),
                DataTypes.FIELD("p", DataTypes.DECIMAL(38, 18)),
                DataTypes.FIELD("q", DataTypes.DECIMAL(38, 18))
            ]))

        exec_insert_table(
            t.select("bigint_func(a), bigint_func_none(b),"
                     "tinyint_func(c), boolean_func(d),"
                     "smallint_func(e),int_func(f),"
                     "float_func(g),double_func(h),"
                     "bytes_func(i),str_func(j),"
                     "date_func(k),time_func(l),"
                     "timestamp_func(m),array_func(n),"
                     "map_func(o),decimal_func(p),"
                     "decimal_cut_func(q)"), "Results")
        actual = source_sink_utils.results()
        # Currently the sink result precision of DataTypes.TIME(precision) only supports 0.
        self.assert_equals(actual, [
            "1,null,1,true,32767,-2147483648,1.23,1.98932,"
            "[102, 108, 105, 110, 107],pyflink,2014-09-13,"
            "12:00:00,2018-03-11 03:00:00.123,[1, 2, 3],"
            "{1=flink, 2=pyflink},1000000000000000000.050000000000000000,"
            "1000000000000000000.059999999999999999"
        ])
Example 30
 def get_accumulator_type(self):
     return DataTypes.ROW([DataTypes.FIELD("f0", DataTypes.LIST_VIEW(DataTypes.STRING()))])