def __init__(self, spec, keyed_state_backend):
    """Initialize window metadata and reusable row buffers, then delegate to the base operation."""
    # The group-window definition must be in place before building the
    # named-property extractor, which is derived from it.
    self._window = spec.serialized_fn.group_window
    self._named_property_extractor = self._create_named_property_function()
    # Reusable Row instances: filled per record instead of reallocating.
    self._reuse_key_data = Row()
    self._reuse_timer_data = Row()
    self._is_time_window = None
    super(StreamGroupWindowAggregateOperation, self).__init__(spec, keyed_state_backend)
def test_row_coder(self):
    """Round-trip a Row whose even-indexed fields are None through a RowCoder."""
    from pyflink.table import Row
    num_fields = 10
    # A single BigIntCoder instance is shared across all fields.
    shared_coder = BigIntCoder()
    coder = RowCoder([shared_coder] * num_fields)
    values = [None if idx % 2 == 0 else idx for idx in range(num_fields)]
    self.check_coder(coder, Row(*values))
def process(self, windowed_value):
    """Evaluate every scalar function on the input value and emit one combined Row."""
    from pyflink.table import Row
    value = windowed_value.value
    fields = []
    for invoker in self.scalar_function_invokers:
        fields.append(invoker.invoke_eval(value))
    # Forward the execution results back downstream as a single-row batch.
    self.output_processor.process_outputs(windowed_value, [Row(*fields)])
def decode_from_stream(self, in_stream, nested):
    """Decode one Row: read the null mask first, then decode only the non-null fields in order."""
    field_coders = self._field_coders
    null_mask = self.read_null_mask(len(field_coders), in_stream)
    assert len(null_mask) == len(field_coders)
    fields = []
    # zip preserves field order, so stream consumption matches the encoder.
    for is_null, coder in zip(null_mask, field_coders):
        fields.append(None if is_null else coder.decode_from_stream(in_stream, nested))
    return Row(*fields)
def test_from_element(self):
    """End-to-end check: build a table from one element covering every supported
    data type (temporal, interval, arrays, decimal, nested row, map, bytes and
    UDTs), write it to a test sink and compare the serialized output.
    """
    t_env = self.t_env
    field_names = [
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
        "o", "p", "q", "r", "s"
    ]
    # One entry per field name above; "k" uses a non-nullable element type to
    # match the array.array("d", ...) value supplied below.
    field_types = [
        DataTypes.BIGINT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING(),
        DataTypes.STRING(),
        DataTypes.DATE(),
        DataTypes.TIME(),
        DataTypes.TIMESTAMP(),
        DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(),
        DataTypes.INTERVAL(DataTypes.DAY(), DataTypes.SECOND()),
        DataTypes.ARRAY(DataTypes.DOUBLE()),
        DataTypes.ARRAY(DataTypes.DOUBLE(False)),
        DataTypes.ARRAY(DataTypes.STRING()),
        DataTypes.ARRAY(DataTypes.DATE()),
        DataTypes.DECIMAL(10, 0),
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.DOUBLE())
        ]),
        DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
        DataTypes.BYTES(),
        ExamplePointUDT(),
        PythonOnlyUDT()
    ]
    # Pair each name with its type to build the full row schema.
    schema = DataTypes.ROW(
        list(
            map(
                lambda field_name, field_type: DataTypes.FIELD(
                    field_name, field_type), field_names, field_types)))
    table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
    t_env.register_table_sink("Results", table_sink)
    # One tuple, positionally aligned with field_names/field_types.
    t = t_env.from_elements(
        [(1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
          datetime.time(1, 0, 0),
          datetime.datetime(1970, 1, 2, 0, 0),
          datetime.datetime(1970, 1, 2, 0, 0),
          datetime.timedelta(days=1, microseconds=10),
          [1.0, None], array.array("d", [1.0, 2.0]), ["abc"],
          [datetime.date(1970, 1, 2)], Decimal(1), Row("a", "b")(1, 2.0),
          {"key": 1.0}, bytearray(b'ABCD'),
          ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0))], schema)
    t.insert_into("Results")
    self.env.execute()
    actual = source_sink_utils.results()
    # Expected sink output: one comma-separated record in Java toString form
    # (e.g. the interval serializes as 86400000010 milliseconds+micros).
    expected = [
        '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
        '1970-01-02 00:00:00.0,86400000010,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
        '1,1,2.0,{key=1.0},[65, 66, 67, 68],[1.0, 2.0],[3.0, 4.0]'
    ]
    self.assert_equals(actual, expected)
def test_cython_row_coder(self):
    """Check the pure-Python and Cython RowCoderImpl agree on a Row containing a None field."""
    from pyflink.table import Row
    num_fields = 2
    row = Row(*[None if idx % 2 == 0 else idx for idx in range(num_fields)])
    py_coders = [coder_impl.RowCoderImpl(
        [coder_impl.BigIntCoderImpl() for _ in range(num_fields)])]
    fast_coders = [coder_impl_fast.RowCoderImpl(
        [coder_impl_fast.BigIntCoderImpl() for _ in range(num_fields)])]
    self.check_cython_coder(py_coders, fast_coders, [[row]])
def test_blink_from_element(self):
    """Same single-element round-trip as test_from_element, but on the blink
    batch planner, which uses a slightly different type set (TIMESTAMP(3),
    second-precision interval, high-precision DECIMAL, no ExamplePointUDT).
    """
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance(
        ).use_blink_planner().in_batch_mode().build())
    field_names = [
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
        "o", "p", "q"
    ]
    # One entry per field name above.
    field_types = [
        DataTypes.BIGINT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING(),
        DataTypes.STRING(),
        DataTypes.DATE(),
        DataTypes.TIME(),
        DataTypes.TIMESTAMP(3),
        DataTypes.INTERVAL(DataTypes.SECOND(3)),
        DataTypes.ARRAY(DataTypes.DOUBLE()),
        DataTypes.ARRAY(DataTypes.DOUBLE(False)),
        DataTypes.ARRAY(DataTypes.STRING()),
        DataTypes.ARRAY(DataTypes.DATE()),
        DataTypes.DECIMAL(38, 18),
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.DOUBLE())
        ]),
        DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
        DataTypes.BYTES(),
        PythonOnlyUDT()
    ]
    # Pair each name with its type to build the full row schema.
    schema = DataTypes.ROW(
        list(
            map(
                lambda field_name, field_type: DataTypes.FIELD(
                    field_name, field_type), field_names, field_types)))
    table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
    t_env.register_table_sink("Results", table_sink)
    # One tuple, positionally aligned with field_names/field_types.
    t = t_env.from_elements(
        [(1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
          datetime.time(1, 0, 0),
          datetime.datetime(1970, 1, 2, 0, 0),
          datetime.timedelta(days=1, microseconds=10),
          [1.0, None], array.array("d", [1.0, 2.0]), ["abc"],
          [datetime.date(1970, 1, 2)], Decimal(1), Row("a", "b")(1, 2.0),
          {"key": 1.0}, bytearray(b'ABCD'),
          PythonOnlyPoint(3.0, 4.0))], schema)
    t.insert_into("Results")
    t_env.execute("test")
    actual = source_sink_utils.results()
    # Expected sink output: the interval serializes as 86400000 milliseconds,
    # the DECIMAL(38, 18) as 1.000000000000000000.
    expected = [
        '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
        '86400000,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
        '1.000000000000000000,1,2.0,{key=1.0},[65, 66, 67, 68],[3.0, 4.0]'
    ]
    self.assert_equals(actual, expected)
def test_from_element(self):
    """Build a table from a single element covering the legacy planner's data
    types (including UDTs) and verify the serialized sink output.
    """
    t_env = self.t_env
    # array('b') holding the ASCII codes of 'ABCD' -> [65, 66, 67, 68].
    # Fix: array.fromstring() rejected str arguments on Python 3 and was
    # removed entirely in Python 3.9; frombytes() is the exact equivalent.
    a = array.array('b')
    a.frombytes(b'ABCD')
    t = t_env.from_elements([
        (1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
         datetime.time(1, 0, 0), datetime.datetime(1970, 1, 2, 0, 0),
         [1.0, None], array.array("d", [1.0, 2.0]), ["abc"],
         [datetime.date(1970, 1, 2)], Decimal(1), Row("a", "b")(1, 2.0),
         {"key": 1.0}, a, ExamplePoint(1.0, 2.0), PythonOnlyPoint(3.0, 4.0))
    ])
    field_names = [
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
        "o", "p", "q"
    ]
    # One entry per field name above, positionally aligned with the tuple.
    field_types = [
        DataTypes.BIGINT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING(),
        DataTypes.STRING(),
        DataTypes.DATE(),
        DataTypes.TIME(),
        DataTypes.TIMESTAMP(),
        DataTypes.ARRAY(DataTypes.DOUBLE()),
        DataTypes.ARRAY(DataTypes.DOUBLE(False)),
        DataTypes.ARRAY(DataTypes.STRING()),
        DataTypes.ARRAY(DataTypes.DATE()),
        DataTypes.DECIMAL(),
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.BIGINT()),
            DataTypes.FIELD("b", DataTypes.DOUBLE())
        ]),
        DataTypes.MAP(DataTypes.VARCHAR(), DataTypes.DOUBLE()),
        DataTypes.VARBINARY(),
        ExamplePointUDT(),
        PythonOnlyUDT()
    ]
    t_env.register_table_sink("Results", field_names, field_types,
                              source_sink_utils.TestAppendSink())
    t.insert_into("Results")
    t_env.exec_env().execute()
    actual = source_sink_utils.results()
    # Expected sink output: one comma-separated record in Java toString form;
    # the VARBINARY column renders as its byte values [65, 66, 67, 68].
    expected = [
        '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,[1.0, null],'
        '[1.0, 2.0],[abc],[1970-01-02],1,1,2.0,{key=1.0},[65, 66, 67, 68],[1.0, 2.0],'
        '[3.0, 4.0]'
    ]
    self.assert_equals(actual, expected)
def decode_from_stream(self, in_stream, nested):
    """Decode the field values via the parent coder and wrap them in a Row."""
    fields = super(RowCoderImpl, self).decode_from_stream(in_stream, nested)
    return Row(*fields)
def test_all_data_types(self):
    """Pandas UDF round-trip over every supported data type.

    Each UDF asserts that its argument arrives as a pandas.Series whose first
    element has the expected concrete type (and, for a few columns, the
    expected value), then returns the series unchanged.  The results are
    written to a test sink and compared against the serialized output.
    """
    import pandas as pd
    import numpy as np

    def tinyint_func(tinyint_param):
        assert isinstance(tinyint_param, pd.Series)
        assert isinstance(tinyint_param[0], np.int8), \
            'tinyint_param of wrong type %s !' % type(tinyint_param[0])
        return tinyint_param

    def smallint_func(smallint_param):
        assert isinstance(smallint_param, pd.Series)
        assert isinstance(smallint_param[0], np.int16), \
            'smallint_param of wrong type %s !' % type(smallint_param[0])
        assert smallint_param[0] == 32767, \
            'smallint_param of wrong value %s' % smallint_param
        return smallint_param

    def int_func(int_param):
        assert isinstance(int_param, pd.Series)
        assert isinstance(int_param[0], np.int32), \
            'int_param of wrong type %s !' % type(int_param[0])
        assert int_param[0] == -2147483648, \
            'int_param of wrong value %s' % int_param
        return int_param

    def bigint_func(bigint_param):
        assert isinstance(bigint_param, pd.Series)
        assert isinstance(bigint_param[0], np.int64), \
            'bigint_param of wrong type %s !' % type(bigint_param[0])
        return bigint_param

    def boolean_func(boolean_param):
        assert isinstance(boolean_param, pd.Series)
        assert isinstance(boolean_param[0], np.bool_), \
            'boolean_param of wrong type %s !' % type(boolean_param[0])
        return boolean_param

    def float_func(float_param):
        assert isinstance(float_param, pd.Series)
        assert isinstance(float_param[0], np.float32), \
            'float_param of wrong type %s !' % type(float_param[0])
        return float_param

    def double_func(double_param):
        assert isinstance(double_param, pd.Series)
        assert isinstance(double_param[0], np.float64), \
            'double_param of wrong type %s !' % type(double_param[0])
        return double_param

    def varchar_func(varchar_param):
        assert isinstance(varchar_param, pd.Series)
        assert isinstance(varchar_param[0], str), \
            'varchar_param of wrong type %s !' % type(varchar_param[0])
        return varchar_param

    def varbinary_func(varbinary_param):
        assert isinstance(varbinary_param, pd.Series)
        assert isinstance(varbinary_param[0], bytes), \
            'varbinary_param of wrong type %s !' % type(varbinary_param[0])
        return varbinary_param

    def decimal_func(decimal_param):
        assert isinstance(decimal_param, pd.Series)
        assert isinstance(decimal_param[0], decimal.Decimal), \
            'decimal_param of wrong type %s !' % type(decimal_param[0])
        return decimal_param

    def date_func(date_param):
        assert isinstance(date_param, pd.Series)
        assert isinstance(date_param[0], datetime.date), \
            'date_param of wrong type %s !' % type(date_param[0])
        return date_param

    def time_func(time_param):
        assert isinstance(time_param, pd.Series)
        assert isinstance(time_param[0], datetime.time), \
            'time_param of wrong type %s !' % type(time_param[0])
        return time_param

    timestamp_value = datetime.datetime(1970, 1, 2, 0, 0, 0, 123000)

    def timestamp_func(timestamp_param):
        assert isinstance(timestamp_param, pd.Series)
        assert isinstance(timestamp_param[0], datetime.datetime), \
            'timestamp_param of wrong type %s !' % type(timestamp_param[0])
        assert timestamp_param[0] == timestamp_value, \
            'timestamp_param is wrong value %s, should be %s!' % (timestamp_param[0],
                                                                  timestamp_value)
        return timestamp_param

    def array_func(array_param):
        assert isinstance(array_param, pd.Series)
        assert isinstance(array_param[0], np.ndarray), \
            'array_param of wrong type %s !' % type(array_param[0])
        return array_param

    def nested_array_func(nested_array_param):
        assert isinstance(nested_array_param, pd.Series)
        assert isinstance(nested_array_param[0], np.ndarray), \
            'nested_array_param of wrong type %s !' % type(nested_array_param[0])
        # Flatten one level: the single nested array becomes the result column.
        return pd.Series(nested_array_param[0])

    def row_func(row_param):
        assert isinstance(row_param, pd.Series)
        assert isinstance(row_param[0], dict), \
            'row_param of wrong type %s !' % type(row_param[0])
        return row_param

    self.t_env.create_temporary_system_function(
        "tinyint_func",
        udf(tinyint_func, result_type=DataTypes.TINYINT(), udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "smallint_func",
        udf(smallint_func, result_type=DataTypes.SMALLINT(), udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "int_func",
        udf(int_func, result_type=DataTypes.INT(), udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "bigint_func",
        udf(bigint_func, result_type=DataTypes.BIGINT(), udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "boolean_func",
        udf(boolean_func, result_type=DataTypes.BOOLEAN(), udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "float_func",
        udf(float_func, result_type=DataTypes.FLOAT(), udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "double_func",
        udf(double_func, result_type=DataTypes.DOUBLE(), udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "varchar_func",
        udf(varchar_func, result_type=DataTypes.STRING(), udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "varbinary_func",
        udf(varbinary_func, result_type=DataTypes.BYTES(), udf_type="pandas"))
    # Consistency fix: this registration previously went through the
    # deprecated t_env.register_function; use the same API as every other
    # UDF in this test.
    self.t_env.create_temporary_system_function(
        "decimal_func",
        udf(decimal_func, result_type=DataTypes.DECIMAL(38, 18),
            udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "date_func",
        udf(date_func, result_type=DataTypes.DATE(), udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "time_func",
        udf(time_func, result_type=DataTypes.TIME(), udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "timestamp_func",
        udf(timestamp_func, result_type=DataTypes.TIMESTAMP(3),
            udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "array_str_func",
        udf(array_func, result_type=DataTypes.ARRAY(DataTypes.STRING()),
            udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "array_timestamp_func",
        udf(array_func, result_type=DataTypes.ARRAY(DataTypes.TIMESTAMP(3)),
            udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "array_int_func",
        udf(array_func, result_type=DataTypes.ARRAY(DataTypes.INT()),
            udf_type="pandas"))
    self.t_env.create_temporary_system_function(
        "nested_array_func",
        udf(nested_array_func, result_type=DataTypes.ARRAY(DataTypes.STRING()),
            udf_type="pandas"))

    row_type = DataTypes.ROW([
        DataTypes.FIELD("f1", DataTypes.INT()),
        DataTypes.FIELD("f2", DataTypes.STRING()),
        DataTypes.FIELD("f3", DataTypes.TIMESTAMP(3)),
        DataTypes.FIELD("f4", DataTypes.ARRAY(DataTypes.INT()))
    ])
    self.t_env.create_temporary_system_function(
        "row_func", udf(row_func, result_type=row_type, udf_type="pandas"))

    # Sink columns a..u, positionally aligned with the select() below.
    table_sink = source_sink_utils.TestAppendSink([
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u'
    ], [
        DataTypes.TINYINT(),
        DataTypes.SMALLINT(),
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.BOOLEAN(),
        DataTypes.BOOLEAN(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING(),
        DataTypes.STRING(),
        DataTypes.BYTES(),
        DataTypes.DECIMAL(38, 18),
        DataTypes.DECIMAL(38, 18),
        DataTypes.DATE(),
        DataTypes.TIME(),
        DataTypes.TIMESTAMP(3),
        DataTypes.ARRAY(DataTypes.STRING()),
        DataTypes.ARRAY(DataTypes.TIMESTAMP(3)),
        DataTypes.ARRAY(DataTypes.INT()),
        DataTypes.ARRAY(DataTypes.STRING()),
        row_type
    ])
    self.t_env.register_table_sink("Results", table_sink)
    t = self.t_env.from_elements(
        [(1, 32767, -2147483648, 1, True, False, 1.0, 1.0, 'hello', '中文',
          bytearray(b'flink'), decimal.Decimal('1000000000000000000.05'),
          decimal.Decimal('1000000000000000000.05999999999999999899999999999'),
          datetime.date(2014, 9, 13),
          datetime.time(hour=1, minute=0, second=1),
          timestamp_value, ['hello', '中文', None], [timestamp_value], [1, 2],
          [['hello', '中文', None]],
          Row(1, 'hello', timestamp_value, [1, 2]))],
        DataTypes.ROW([
            DataTypes.FIELD("a", DataTypes.TINYINT()),
            DataTypes.FIELD("b", DataTypes.SMALLINT()),
            DataTypes.FIELD("c", DataTypes.INT()),
            DataTypes.FIELD("d", DataTypes.BIGINT()),
            DataTypes.FIELD("e", DataTypes.BOOLEAN()),
            DataTypes.FIELD("f", DataTypes.BOOLEAN()),
            DataTypes.FIELD("g", DataTypes.FLOAT()),
            DataTypes.FIELD("h", DataTypes.DOUBLE()),
            DataTypes.FIELD("i", DataTypes.STRING()),
            DataTypes.FIELD("j", DataTypes.STRING()),
            DataTypes.FIELD("k", DataTypes.BYTES()),
            DataTypes.FIELD("l", DataTypes.DECIMAL(38, 18)),
            DataTypes.FIELD("m", DataTypes.DECIMAL(38, 18)),
            DataTypes.FIELD("n", DataTypes.DATE()),
            DataTypes.FIELD("o", DataTypes.TIME()),
            DataTypes.FIELD("p", DataTypes.TIMESTAMP(3)),
            DataTypes.FIELD("q", DataTypes.ARRAY(DataTypes.STRING())),
            DataTypes.FIELD("r", DataTypes.ARRAY(DataTypes.TIMESTAMP(3))),
            DataTypes.FIELD("s", DataTypes.ARRAY(DataTypes.INT())),
            DataTypes.FIELD(
                "t", DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.STRING()))),
            DataTypes.FIELD("u", row_type)
        ]))
    exec_insert_table(
        t.select("tinyint_func(a),"
                 "smallint_func(b),"
                 "int_func(c),"
                 "bigint_func(d),"
                 "boolean_func(e),"
                 "boolean_func(f),"
                 "float_func(g),"
                 "double_func(h),"
                 "varchar_func(i),"
                 "varchar_func(j),"
                 "varbinary_func(k),"
                 "decimal_func(l),"
                 "decimal_func(m),"
                 "date_func(n),"
                 "time_func(o),"
                 "timestamp_func(p),"
                 "array_str_func(q),"
                 "array_timestamp_func(r),"
                 "array_int_func(s),"
                 "nested_array_func(t),"
                 "row_func(u)"), "Results")
    actual = source_sink_utils.results()
    self.assert_equals(actual, [
        "1,32767,-2147483648,1,true,false,1.0,1.0,hello,中文,"
        "[102, 108, 105, 110, 107],1000000000000000000.050000000000000000,"
        "1000000000000000000.059999999999999999,2014-09-13,01:00:01,"
        "1970-01-02 00:00:00.123,[hello, 中文, null],[1970-01-02 00:00:00.123],"
        "[1, 2],[hello, 中文, null],1,hello,1970-01-02 00:00:00.123,[1, 2]"
    ])
def decode_from_stream(self, in_stream, nested):
    """Read one row's field values from the stream and package them as a Row."""
    field_values = self._decode_one_row_from_stream(in_stream, nested)
    return Row(*field_values)
def identity(x):
    """Wrap a non-None value in a single-field Row; pass None through unchanged."""
    if x is None:
        return None
    from pyflink.table import Row
    return Row(x)