Esempio n. 1
0
 def __init__(self, spec, keyed_state_backend):
     """Set up window-related state, then delegate to the base operation.

     :param spec: operation spec; ``spec.serialized_fn.group_window`` is read here.
     :param keyed_state_backend: forwarded unchanged to the base ``__init__``.
     """
     # Group window definition taken from the serialized function.
     self._window = spec.serialized_fn.group_window
     # NOTE(review): presumably relies on self._window being set first -- confirm.
     self._named_property_extractor = self._create_named_property_function()
     # Starts unset; where it is resolved is not visible in this block.
     self._is_time_window = None
     # Row instances presumably reused across calls to avoid reallocation -- confirm.
     self._reuse_timer_data = Row()
     self._reuse_key_data = Row()
     # Base initialisation runs last, after the attributes above exist.
     super(StreamGroupWindowAggregateOperation, self).__init__(spec, keyed_state_backend)
Esempio n. 2
0
 def test_row_coder(self):
     """Check RowCoder on a ten-field Row whose even-indexed fields are None."""
     from pyflink.table import Row
     field_count = 10
     # The same BigIntCoder instance is used for every field.
     shared_coder = BigIntCoder()
     coder = RowCoder([shared_coder] * field_count)
     values = [i if i % 2 else None for i in range(field_count)]
     self.check_coder(coder, Row(*values))
Esempio n. 3
0
 def process(self, windowed_value):
     """Evaluate every scalar function invoker on the value and emit one Row.

     :param windowed_value: element whose ``value`` is passed to each invoker.
     """
     outputs = []
     for invoker in self.scalar_function_invokers:
         outputs.append(invoker.invoke_eval(windowed_value.value))
     from pyflink.table import Row
     # send the execution results back
     self.output_processor.process_outputs(windowed_value, [Row(*outputs)])
Esempio n. 4
0
 def decode_from_stream(self, in_stream, nested):
     """Decode one Row: read the null mask, then decode each non-null field.

     :param in_stream: input stream to read from.
     :param nested: forwarded to each field coder.
     :return: a Row with None in the positions flagged by the null mask.
     """
     null_mask = self.read_null_mask(len(self._field_coders), in_stream)
     assert len(null_mask) == len(self._field_coders)
     fields = []
     for idx, is_null in enumerate(null_mask):
         if is_null:
             fields.append(None)
         else:
             fields.append(
                 self._field_coders[idx].decode_from_stream(in_stream, nested))
     return Row(*fields)
Esempio n. 5
0
    def test_from_element(self):
        """Send one row covering many data types through ``from_elements``.

        The row is inserted into an append sink registered as "Results" and the
        sink's collected output is compared with the expected rendered line.
        """
        t_env = self.t_env
        field_names = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r", "s"
        ]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(),
            DataTypes.INTERVAL(DataTypes.DAY(), DataTypes.SECOND()),
            DataTypes.ARRAY(DataTypes.DOUBLE()),
            DataTypes.ARRAY(DataTypes.DOUBLE(False)),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.DATE()),
            DataTypes.DECIMAL(10, 0),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.DOUBLE())
            ]),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
            DataTypes.BYTES(),
            ExamplePointUDT(),
            PythonOnlyUDT()
        ]
        # Pair every column name with its type to form the row schema.
        schema = DataTypes.ROW([
            DataTypes.FIELD(name, data_type)
            for name, data_type in zip(field_names, field_types)
        ])
        sink = source_sink_utils.TestAppendSink(field_names, field_types)
        t_env.register_table_sink("Results", sink)

        # One value per column, listed in the same order as field_types.
        row = (
            1,
            1.0,
            "hi",
            "hello",
            datetime.date(1970, 1, 2),
            datetime.time(1, 0, 0),
            datetime.datetime(1970, 1, 2, 0, 0),
            datetime.datetime(1970, 1, 2, 0, 0),
            datetime.timedelta(days=1, microseconds=10),
            [1.0, None],
            array.array("d", [1.0, 2.0]),
            ["abc"],
            [datetime.date(1970, 1, 2)],
            Decimal(1),
            Row("a", "b")(1, 2.0),
            {"key": 1.0},
            bytearray(b'ABCD'),
            ExamplePoint(1.0, 2.0),
            PythonOnlyPoint(3.0, 4.0),
        )
        table = t_env.from_elements([row], schema)
        table.insert_into("Results")
        self.env.execute()
        collected = source_sink_utils.results()

        expected = [
            '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
            '1970-01-02 00:00:00.0,86400000010,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
            '1,1,2.0,{key=1.0},[65, 66, 67, 68],[1.0, 2.0],[3.0, 4.0]'
        ]
        self.assert_equals(collected, expected)
Esempio n. 6
0
 def test_cython_row_coder(self):
     """Compare the Python and Cython RowCoderImpl on a two-field Row."""
     from pyflink.table import Row
     field_count = 2
     # Even-indexed fields are None, odd-indexed fields hold their index.
     row_values = [i if i % 2 else None for i in range(field_count)]
     data = [Row(*row_values)]
     python_bigint_coders = [coder_impl.BigIntCoderImpl() for _ in range(field_count)]
     python_field_coders = [coder_impl.RowCoderImpl(python_bigint_coders)]
     cython_bigint_coders = [coder_impl_fast.BigIntCoderImpl() for _ in range(field_count)]
     cython_field_coders = [coder_impl_fast.RowCoderImpl(cython_bigint_coders)]
     self.check_cython_coder(python_field_coders, cython_field_coders, [data])
Esempio n. 7
0
    def test_blink_from_element(self):
        """Run the ``from_elements`` round trip on a blink-planner batch environment.

        A single row covering many data types is inserted into an append sink
        registered as "Results"; the collected output is compared with the
        expected rendered line.
        """
        settings = EnvironmentSettings.new_instance(
        ).use_blink_planner().in_batch_mode().build()
        t_env = BatchTableEnvironment.create(environment_settings=settings)
        field_names = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q"
        ]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(3),
            DataTypes.INTERVAL(DataTypes.SECOND(3)),
            DataTypes.ARRAY(DataTypes.DOUBLE()),
            DataTypes.ARRAY(DataTypes.DOUBLE(False)),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.DATE()),
            DataTypes.DECIMAL(38, 18),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.DOUBLE())
            ]),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
            DataTypes.BYTES(),
            PythonOnlyUDT()
        ]
        # Pair every column name with its type to form the row schema.
        schema = DataTypes.ROW([
            DataTypes.FIELD(name, data_type)
            for name, data_type in zip(field_names, field_types)
        ])
        sink = source_sink_utils.TestAppendSink(field_names, field_types)
        t_env.register_table_sink("Results", sink)

        # One value per column, listed in the same order as field_types.
        row = (
            1,
            1.0,
            "hi",
            "hello",
            datetime.date(1970, 1, 2),
            datetime.time(1, 0, 0),
            datetime.datetime(1970, 1, 2, 0, 0),
            datetime.timedelta(days=1, microseconds=10),
            [1.0, None],
            array.array("d", [1.0, 2.0]),
            ["abc"],
            [datetime.date(1970, 1, 2)],
            Decimal(1),
            Row("a", "b")(1, 2.0),
            {"key": 1.0},
            bytearray(b'ABCD'),
            PythonOnlyPoint(3.0, 4.0),
        )
        table = t_env.from_elements([row], schema)
        table.insert_into("Results")
        t_env.execute("test")
        collected = source_sink_utils.results()

        expected = [
            '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
            '86400000,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
            '1.000000000000000000,1,2.0,{key=1.0},[65, 66, 67, 68],[3.0, 4.0]'
        ]
        self.assert_equals(collected, expected)
Esempio n. 8
0
    def test_from_element(self):
        """Send one row covering many data types through ``from_elements``.

        The table is built without an explicit schema, inserted into an append
        sink registered as "Results", and the collected output is compared
        with the expected rendered line.
        """
        t_env = self.t_env
        # array.fromstring() was removed in Python 3.9; passing a bytes
        # initializer builds the same signed-byte array [65, 66, 67, 68].
        a = array.array('b', b'ABCD')
        t = t_env.from_elements([
            (1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
             datetime.time(1, 0, 0), datetime.datetime(1970, 1, 2, 0,
                                                       0), [1.0, None],
             array.array("d",
                         [1.0, 2.0]), ["abc"], [datetime.date(1970, 1, 2)],
             Decimal(1), Row("a", "b")(1, 2.0), {
                 "key": 1.0
             }, a, ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0))
        ])
        # Sink columns, one per element of the input tuple above.
        field_names = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q"
        ]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(),
            DataTypes.ARRAY(DataTypes.DOUBLE()),
            DataTypes.ARRAY(DataTypes.DOUBLE(False)),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.DATE()),
            DataTypes.DECIMAL(),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.DOUBLE())
            ]),
            DataTypes.MAP(DataTypes.VARCHAR(), DataTypes.DOUBLE()),
            DataTypes.VARBINARY(),
            ExamplePointUDT(),
            PythonOnlyUDT()
        ]
        t_env.register_table_sink("Results", field_names, field_types,
                                  source_sink_utils.TestAppendSink())

        t.insert_into("Results")
        t_env.exec_env().execute()
        actual = source_sink_utils.results()

        expected = [
            '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,[1.0, null],'
            '[1.0, 2.0],[abc],[1970-01-02],1,1,2.0,{key=1.0},[65, 66, 67, 68],[1.0, 2.0],'
            '[3.0, 4.0]'
        ]
        self.assert_equals(actual, expected)
Esempio n. 9
0
 def decode_from_stream(self, in_stream, nested):
     """Decode via the parent coder and repack the decoded fields as a Row.

     :param in_stream: input stream handed to the parent implementation.
     :param nested: forwarded unchanged to the parent implementation.
     """
     decoded_fields = super(RowCoderImpl, self).decode_from_stream(in_stream, nested)
     return Row(*decoded_fields)
Esempio n. 10
0
    def test_all_data_types(self):
        """Run pandas UDFs over a single row that covers many data types.

        Each nested function asserts that it receives a ``pd.Series`` whose
        first element has the expected Python/numpy type (some also check the
        value) and returns its input. The functions are registered as pandas
        UDFs, applied column by column, and the sink output is compared with
        the expected rendered line.
        """
        import pandas as pd
        import numpy as np

        # --- per-type check functions; each returns its argument unchanged
        # --- unless noted otherwise.
        def tinyint_func(tinyint_param):
            assert isinstance(tinyint_param, pd.Series)
            assert isinstance(tinyint_param[0], np.int8), \
                'tinyint_param of wrong type %s !' % type(tinyint_param[0])
            return tinyint_param

        def smallint_func(smallint_param):
            assert isinstance(smallint_param, pd.Series)
            assert isinstance(smallint_param[0], np.int16), \
                'smallint_param of wrong type %s !' % type(smallint_param[0])
            assert smallint_param[
                0] == 32767, 'smallint_param of wrong value %s' % smallint_param
            return smallint_param

        def int_func(int_param):
            assert isinstance(int_param, pd.Series)
            assert isinstance(int_param[0], np.int32), \
                'int_param of wrong type %s !' % type(int_param[0])
            assert int_param[
                0] == -2147483648, 'int_param of wrong value %s' % int_param
            return int_param

        def bigint_func(bigint_param):
            assert isinstance(bigint_param, pd.Series)
            assert isinstance(bigint_param[0], np.int64), \
                'bigint_param of wrong type %s !' % type(bigint_param[0])
            return bigint_param

        def boolean_func(boolean_param):
            assert isinstance(boolean_param, pd.Series)
            assert isinstance(boolean_param[0], np.bool_), \
                'boolean_param of wrong type %s !' % type(boolean_param[0])
            return boolean_param

        def float_func(float_param):
            assert isinstance(float_param, pd.Series)
            assert isinstance(float_param[0], np.float32), \
                'float_param of wrong type %s !' % type(float_param[0])
            return float_param

        def double_func(double_param):
            assert isinstance(double_param, pd.Series)
            assert isinstance(double_param[0], np.float64), \
                'double_param of wrong type %s !' % type(double_param[0])
            return double_param

        def varchar_func(varchar_param):
            assert isinstance(varchar_param, pd.Series)
            assert isinstance(varchar_param[0], str), \
                'varchar_param of wrong type %s !' % type(varchar_param[0])
            return varchar_param

        def varbinary_func(varbinary_param):
            assert isinstance(varbinary_param, pd.Series)
            assert isinstance(varbinary_param[0], bytes), \
                'varbinary_param of wrong type %s !' % type(varbinary_param[0])
            return varbinary_param

        def decimal_func(decimal_param):
            assert isinstance(decimal_param, pd.Series)
            assert isinstance(decimal_param[0], decimal.Decimal), \
                'decimal_param of wrong type %s !' % type(decimal_param[0])
            return decimal_param

        def date_func(date_param):
            assert isinstance(date_param, pd.Series)
            assert isinstance(date_param[0], datetime.date), \
                'date_param of wrong type %s !' % type(date_param[0])
            return date_param

        def time_func(time_param):
            assert isinstance(time_param, pd.Series)
            assert isinstance(time_param[0], datetime.time), \
                'time_param of wrong type %s !' % type(time_param[0])
            return time_param

        # Shared between the input row and timestamp_func's value check.
        timestamp_value = datetime.datetime(1970, 1, 2, 0, 0, 0, 123000)

        def timestamp_func(timestamp_param):
            assert isinstance(timestamp_param, pd.Series)
            assert isinstance(timestamp_param[0], datetime.datetime), \
                'timestamp_param of wrong type %s !' % type(timestamp_param[0])
            assert timestamp_param[0] == timestamp_value, \
                'timestamp_param is wrong value %s, should be %s!' % (timestamp_param[0],
                                                                      timestamp_value)
            return timestamp_param

        # Reused below for the string, timestamp and int array UDFs.
        def array_func(array_param):
            assert isinstance(array_param, pd.Series)
            assert isinstance(array_param[0], np.ndarray), \
                'array_param of wrong type %s !' % type(array_param[0])
            return array_param

        # Unlike the others, returns a new Series built from the first element.
        def nested_array_func(nested_array_param):
            assert isinstance(nested_array_param, pd.Series)
            assert isinstance(nested_array_param[0], np.ndarray), \
                'nested_array_param of wrong type %s !' % type(nested_array_param[0])
            return pd.Series(nested_array_param[0])

        def row_func(row_param):
            assert isinstance(row_param, pd.Series)
            assert isinstance(row_param[0], dict), \
                'row_param of wrong type %s !' % type(row_param[0])
            return row_param

        # --- register every check function as a pandas UDF
        self.t_env.create_temporary_system_function(
            "tinyint_func",
            udf(tinyint_func,
                result_type=DataTypes.TINYINT(),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "smallint_func",
            udf(smallint_func,
                result_type=DataTypes.SMALLINT(),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "int_func",
            udf(int_func, result_type=DataTypes.INT(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "bigint_func",
            udf(bigint_func, result_type=DataTypes.BIGINT(),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "boolean_func",
            udf(boolean_func,
                result_type=DataTypes.BOOLEAN(),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "float_func",
            udf(float_func, result_type=DataTypes.FLOAT(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "double_func",
            udf(double_func, result_type=DataTypes.DOUBLE(),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "varchar_func",
            udf(varchar_func,
                result_type=DataTypes.STRING(),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "varbinary_func",
            udf(varbinary_func,
                result_type=DataTypes.BYTES(),
                udf_type="pandas"))

        # NOTE(review): decimal_func is the only UDF registered through
        # register_function instead of create_temporary_system_function --
        # confirm whether this is intentional before unifying.
        self.t_env.register_function(
            "decimal_func",
            udf(decimal_func,
                result_type=DataTypes.DECIMAL(38, 18),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "date_func",
            udf(date_func, result_type=DataTypes.DATE(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "time_func",
            udf(time_func, result_type=DataTypes.TIME(), udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "timestamp_func",
            udf(timestamp_func,
                result_type=DataTypes.TIMESTAMP(3),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "array_str_func",
            udf(array_func,
                result_type=DataTypes.ARRAY(DataTypes.STRING()),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "array_timestamp_func",
            udf(array_func,
                result_type=DataTypes.ARRAY(DataTypes.TIMESTAMP(3)),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "array_int_func",
            udf(array_func,
                result_type=DataTypes.ARRAY(DataTypes.INT()),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function(
            "nested_array_func",
            udf(nested_array_func,
                result_type=DataTypes.ARRAY(DataTypes.STRING()),
                udf_type="pandas"))

        # Row type shared by row_func's result, the sink and the input schema.
        row_type = DataTypes.ROW([
            DataTypes.FIELD("f1", DataTypes.INT()),
            DataTypes.FIELD("f2", DataTypes.STRING()),
            DataTypes.FIELD("f3", DataTypes.TIMESTAMP(3)),
            DataTypes.FIELD("f4", DataTypes.ARRAY(DataTypes.INT()))
        ])
        self.t_env.create_temporary_system_function(
            "row_func", udf(row_func, result_type=row_type, udf_type="pandas"))

        # --- sink: one column per UDF call in the select below
        table_sink = source_sink_utils.TestAppendSink([
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u'
        ], [
            DataTypes.TINYINT(),
            DataTypes.SMALLINT(),
            DataTypes.INT(),
            DataTypes.BIGINT(),
            DataTypes.BOOLEAN(),
            DataTypes.BOOLEAN(),
            DataTypes.FLOAT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.BYTES(),
            DataTypes.DECIMAL(38, 18),
            DataTypes.DECIMAL(38, 18),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(3),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.TIMESTAMP(3)),
            DataTypes.ARRAY(DataTypes.INT()),
            DataTypes.ARRAY(DataTypes.STRING()), row_type
        ])
        self.t_env.register_table_sink("Results", table_sink)

        # --- input: a single row with an explicit schema (columns a..u)
        t = self.t_env.from_elements(
            [(1, 32767, -2147483648, 1, True, False, 1.0, 1.0, 'hello', '中文',
              bytearray(b'flink'), decimal.Decimal('1000000000000000000.05'),
              decimal.Decimal(
                  '1000000000000000000.05999999999999999899999999999'),
              datetime.date(2014, 9, 13),
              datetime.time(hour=1, minute=0, second=1), timestamp_value,
              ['hello', '中文', None], [timestamp_value], [1, 2], [[
                  'hello', '中文', None
              ]], Row(1, 'hello', timestamp_value, [1, 2]))],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD("b", DataTypes.SMALLINT()),
                DataTypes.FIELD("c", DataTypes.INT()),
                DataTypes.FIELD("d", DataTypes.BIGINT()),
                DataTypes.FIELD("e", DataTypes.BOOLEAN()),
                DataTypes.FIELD("f", DataTypes.BOOLEAN()),
                DataTypes.FIELD("g", DataTypes.FLOAT()),
                DataTypes.FIELD("h", DataTypes.DOUBLE()),
                DataTypes.FIELD("i", DataTypes.STRING()),
                DataTypes.FIELD("j", DataTypes.STRING()),
                DataTypes.FIELD("k", DataTypes.BYTES()),
                DataTypes.FIELD("l", DataTypes.DECIMAL(38, 18)),
                DataTypes.FIELD("m", DataTypes.DECIMAL(38, 18)),
                DataTypes.FIELD("n", DataTypes.DATE()),
                DataTypes.FIELD("o", DataTypes.TIME()),
                DataTypes.FIELD("p", DataTypes.TIMESTAMP(3)),
                DataTypes.FIELD("q", DataTypes.ARRAY(DataTypes.STRING())),
                DataTypes.FIELD("r", DataTypes.ARRAY(DataTypes.TIMESTAMP(3))),
                DataTypes.FIELD("s", DataTypes.ARRAY(DataTypes.INT())),
                DataTypes.FIELD(
                    "t", DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))),
                DataTypes.FIELD("u", row_type)
            ]))

        # --- apply one UDF per column and write the result to the sink
        exec_insert_table(
            t.select("tinyint_func(a),"
                     "smallint_func(b),"
                     "int_func(c),"
                     "bigint_func(d),"
                     "boolean_func(e),"
                     "boolean_func(f),"
                     "float_func(g),"
                     "double_func(h),"
                     "varchar_func(i),"
                     "varchar_func(j),"
                     "varbinary_func(k),"
                     "decimal_func(l),"
                     "decimal_func(m),"
                     "date_func(n),"
                     "time_func(o),"
                     "timestamp_func(p),"
                     "array_str_func(q),"
                     "array_timestamp_func(r),"
                     "array_int_func(s),"
                     "nested_array_func(t),"
                     "row_func(u)"), "Results")
        actual = source_sink_utils.results()
        # The second decimal input has more than 18 fractional digits; the
        # expected output shows it with exactly 18.
        self.assert_equals(actual, [
            "1,32767,-2147483648,1,true,false,1.0,1.0,hello,中文,"
            "[102, 108, 105, 110, 107],1000000000000000000.050000000000000000,"
            "1000000000000000000.059999999999999999,2014-09-13,01:00:01,"
            "1970-01-02 00:00:00.123,[hello, 中文, null],[1970-01-02 00:00:00.123],"
            "[1, 2],[hello, 中文, null],1,hello,1970-01-02 00:00:00.123,[1, 2]"
        ])
Esempio n. 11
0
 def decode_from_stream(self, in_stream, nested):
     """Decode one row's fields from the stream and wrap them in a Row.

     :param in_stream: input stream handed to ``_decode_one_row_from_stream``.
     :param nested: forwarded unchanged to ``_decode_one_row_from_stream``.
     """
     decoded_fields = self._decode_one_row_from_stream(in_stream, nested)
     return Row(*decoded_fields)
Esempio n. 12
0
def identity(x):
    """Wrap ``x`` in a single-field Row; return None when ``x`` is None."""
    if x is None:
        return None
    # Imported lazily, only when a Row actually has to be built.
    from pyflink.table import Row
    return Row(x)