Example #1
 def test_cython_row_coder(self):
     from pyflink.common import Row, RowKind
     field_count = 2
     field_names = ['f{}'.format(i) for i in range(field_count)]
     row = Row(**{
         field_names[i]: None if i % 2 == 0 else i
         for i in range(field_count)
     })
     data = [row]
     python_field_coders = [
         coder_impl.RowCoderImpl(
             [coder_impl.BigIntCoderImpl() for _ in range(field_count)],
             field_names)
     ]
     cython_field_coders = [
         coder_impl_fast.RowCoderImpl([
             coder_impl_fast.BigIntCoderImpl() for _ in range(field_count)
         ], field_names)
     ]
     row.set_row_kind(RowKind.INSERT)
     self.check_cython_coder(python_field_coders, cython_field_coders, data)
     row.set_row_kind(RowKind.UPDATE_BEFORE)
     self.check_cython_coder(python_field_coders, cython_field_coders, data)
     row.set_row_kind(RowKind.UPDATE_AFTER)
     self.check_cython_coder(python_field_coders, cython_field_coders, data)
     row.set_row_kind(RowKind.DELETE)
     self.check_cython_coder(python_field_coders, cython_field_coders, data)
Example #2
 def test_mixed_with_built_in_functions_without_retract(self):
     self.t_env.get_config().get_configuration().set_string("parallelism.default", "1")
     self.t_env.create_temporary_system_function(
         "concat",
         ConcatAggregateFunction())
     t = self.t_env.from_elements(
         [('Hi', 2),
          ('Hi', 4),
          (None, None),
          ('hello2', 8),
          ('hello', 10)], ['b', 'c'])
     self.t_env.create_temporary_view("source", t)
     result_table = self.t_env.sql_query(
         "select concat(b, ',') as a, "
         "FIRST_VALUE(b) as b, "
         "LAST_VALUE(b) as c, "
         "COUNT(c) as d, "
         "COUNT(1) as e, "
         "LISTAGG(b) as f,"
         "LISTAGG(b, '|') as g,"
         "MAX(c) as h,"
         "MAX(cast(c as float) + 1) as i,"
         "MIN(c) as j,"
         "MIN(cast(c as decimal) + 1) as k,"
         "SUM(c) as l,"
         "SUM(cast(c as float) + 1) as m "
         "from source")
     result = [i for i in result_table.execute().collect()]
     expected = Row('Hi,Hi,hello,hello2', 'Hi', 'hello', 4, 5, 'Hi,Hi,hello2,hello',
                    'Hi|Hi|hello2|hello', 10, 11.0, 2, Decimal(3.0), 24, 28.0)
     expected.set_row_kind(RowKind.UPDATE_AFTER)
     self.assertEqual(result[len(result) - 1], expected)
Example #3
 def setUpClass(cls):
     super(PandasConversionTestBase, cls).setUpClass()
     cls.data = [(1, 1, 1, 1, True, 1.1, 1.2, 'hello', bytearray(b"aaa"),
                  decimal.Decimal('1000000000000000000.01'), datetime.date(2014, 9, 13),
                  datetime.time(hour=1, minute=0, second=1),
                  datetime.datetime(1970, 1, 1, 0, 0, 0, 123000), ['hello', '中文'],
                  Row(a=1, b='hello', c=datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                      d=[1, 2])),
                 (1, 2, 2, 2, False, 2.1, 2.2, 'world', bytearray(b"bbb"),
                  decimal.Decimal('1000000000000000000.02'), datetime.date(2014, 9, 13),
                  datetime.time(hour=1, minute=0, second=1),
                  datetime.datetime(1970, 1, 1, 0, 0, 0, 123000), ['hello', '中文'],
                  Row(a=1, b='hello', c=datetime.datetime(1970, 1, 1, 0, 0, 0, 123000),
                      d=[1, 2]))]
     cls.data_type = DataTypes.ROW(
         [DataTypes.FIELD("f1", DataTypes.TINYINT()),
          DataTypes.FIELD("f2", DataTypes.SMALLINT()),
          DataTypes.FIELD("f3", DataTypes.INT()),
          DataTypes.FIELD("f4", DataTypes.BIGINT()),
          DataTypes.FIELD("f5", DataTypes.BOOLEAN()),
          DataTypes.FIELD("f6", DataTypes.FLOAT()),
          DataTypes.FIELD("f7", DataTypes.DOUBLE()),
          DataTypes.FIELD("f8", DataTypes.STRING()),
          DataTypes.FIELD("f9", DataTypes.BYTES()),
          DataTypes.FIELD("f10", DataTypes.DECIMAL(38, 18)),
          DataTypes.FIELD("f11", DataTypes.DATE()),
          DataTypes.FIELD("f12", DataTypes.TIME()),
          DataTypes.FIELD("f13", DataTypes.TIMESTAMP(3)),
          DataTypes.FIELD("f14", DataTypes.ARRAY(DataTypes.STRING())),
          DataTypes.FIELD("f15", DataTypes.ROW(
              [DataTypes.FIELD("a", DataTypes.INT()),
               DataTypes.FIELD("b", DataTypes.STRING()),
               DataTypes.FIELD("c", DataTypes.TIMESTAMP(3)),
               DataTypes.FIELD("d", DataTypes.ARRAY(DataTypes.INT()))]))], False)
     cls.pdf = cls.create_pandas_data_frame()
Example #4
    def test_map(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT())]))

        sink_table_ddl = """
        CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)

        func = udf(lambda x: Row(a=x + 1, b=x * x), result_type=DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.BIGINT()),
             DataTypes.FIELD("b", DataTypes.BIGINT())]))

        func2 = udf(lambda x: Row(x.a + 1, x.b * 2), result_type=DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.BIGINT()),
             DataTypes.FIELD("b", DataTypes.BIGINT())]))

        t.map(func(t.b)).alias("a", "b") \
            .map(func(t.a)) \
            .map(func2) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(
            actual, ["+I[5, 18]", "+I[4, 8]", "+I[8, 72]", "+I[11, 162]", "+I[6, 32]"])
Example #5
    def test_map(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD("b", DataTypes.SMALLINT()),
                DataTypes.FIELD("c", DataTypes.INT())
            ]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)

        func = udf(lambda x: Row(a=x + 1, b=x * x),
                   result_type=DataTypes.ROW([
                       DataTypes.FIELD("a", DataTypes.BIGINT()),
                       DataTypes.FIELD("b", DataTypes.BIGINT())
                   ]))

        func2 = udf(lambda x: Row(x.a + 1, x.b * 2),
                    result_type=DataTypes.ROW([
                        DataTypes.FIELD("a", DataTypes.BIGINT()),
                        DataTypes.FIELD("b", DataTypes.BIGINT())
                    ]))

        t.map(func(t.b)).alias("a", "b") \
            .map(func(t.a)) \
            .map(func2) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(
            actual,
            ["+I[5, 18]", "+I[4, 8]", "+I[8, 72]", "+I[11, 162]", "+I[6, 32]"])
Example #6
def batch_seq_num_test():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(2)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    seq_num_source = NumberSequenceSource(1, 1000)

    output_path = '/opt/examples/output/batch_seq_num'
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='file_source',
        type_info=Types.LONG())

    # key the numbers by a % 4 and count the elements of every bucket
    ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
        .sink_to(file_sink)

    env.execute('9-data_stream_batch_seq_num')
Example #7
class RowDataConverter(DataConverter):
    def __init__(self, field_data_converters: List[DataConverter],
                 field_names: List[str]):
        self._field_data_converters = field_data_converters
        self._reuse_row = Row()
        self._reuse_external_row_data = [
            None for _ in range(len(field_data_converters))
        ]
        self._reuse_external_row = [None, self._reuse_external_row_data]
        self._reuse_row.set_field_names(field_names)

    def to_internal(self, value) -> IN:
        if value is None:
            return None

        self._reuse_row._values = [
            self._field_data_converters[i].to_internal(item)
            for i, item in enumerate(value[1])
        ]
        self._reuse_row.set_row_kind(RowKind(value[0]))

        return self._reuse_row

    def to_external(self, value: Row) -> OUT:
        if value is None:
            return None

        self._reuse_external_row[0] = value.get_row_kind().value
        values = value._values
        for i in range(len(values)):
            self._reuse_external_row_data[i] = self._field_data_converters[
                i].to_external(values[i])
        return self._reuse_external_row
Example #8
    def setUp(self):
        super(StringIndexerTest, self).setUp()
        self.train_table = self.t_env.from_data_stream(
            self.env.from_collection([
                ('a', 1.0),
                ('b', 1.0),
                ('b', 2.0),
                ('c', 0.0),
                ('d', 2.0),
                ('a', 2.0),
                ('b', 2.0),
                ('b', -1.0),
                ('a', -1.0),
                ('c', -1.0),
            ],
                type_info=Types.ROW_NAMED(
                    ['input_col1', 'input_col2'],
                    [Types.STRING(), Types.DOUBLE()])))

        self.predict_table = self.t_env.from_data_stream(
            self.env.from_collection([
                ('a', 2.0),
                ('b', 1.0),
                ('e', 2.0),
            ],
                type_info=Types.ROW_NAMED(
                    ['input_col1', 'input_col2'],
                    [Types.STRING(), Types.DOUBLE()])))

        self.expected_alphabetic_asc_predict_data = [
            Row('a', 2.0, 0, 3),
            Row('b', 1.0, 1, 2),
            Row('e', 2.0, 4, 3)
        ]
Example #9
def data_stream_batch_test():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(2)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    input_path = '/opt/examples/data/word_count_input'
    output_path = '/opt/examples/output/data_stream_batch'

    file_source = FileSource\
        .for_record_stream_format(
            StreamFormat.text_line_format(),
            input_path) \
        .process_static_file_set() \
        .build()

    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=file_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='file_source',
        type_info=Types.STRING())

    ds.map(lambda a: Row(a, 1), output_type=Types.ROW([Types.STRING(), Types.INT()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
        .sink_to(file_sink)

    env.execute('8-data_stream_batch')
Example #10
 def test_cython_row_coder(self):
     from pyflink.common import Row, RowKind
     field_count = 2
     row = Row(*[None if i % 2 == 0 else i for i in range(field_count)])
     data = [row]
     python_field_coders = [
         coder_impl.RowCoderImpl(
             [coder_impl.BigIntCoderImpl() for _ in range(field_count)])
     ]
     cython_field_coders = [
         coder_impl_fast.RowCoderImpl([
             coder_impl_fast.BigIntCoderImpl() for _ in range(field_count)
         ])
     ]
     row.set_row_kind(RowKind.INSERT)
     self.check_cython_coder(python_field_coders, cython_field_coders,
                             [data])
     row.set_row_kind(RowKind.UPDATE_BEFORE)
     self.check_cython_coder(python_field_coders, cython_field_coders,
                             [data])
     row.set_row_kind(RowKind.UPDATE_AFTER)
     self.check_cython_coder(python_field_coders, cython_field_coders,
                             [data])
     row.set_row_kind(RowKind.DELETE)
     self.check_cython_coder(python_field_coders, cython_field_coders,
                             [data])
Example #11
    def test_map_with_pandas_udf(self):
        t = self.t_env.from_elements(
            [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)),
             (2, Row(3, 4))],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD(
                    "b",
                    DataTypes.ROW([
                        DataTypes.FIELD("c", DataTypes.INT()),
                        DataTypes.FIELD("d", DataTypes.INT())
                    ]))
            ]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)

        def func(x):
            import pandas as pd
            res = pd.concat([x.a, x.c + x.d], axis=1)
            return res

        def func2(x):
            return x * 2

        def func3(x):
            assert isinstance(x, Row)
            return x

        pandas_udf = udf(func,
                         result_type=DataTypes.ROW([
                             DataTypes.FIELD("c", DataTypes.BIGINT()),
                             DataTypes.FIELD("d", DataTypes.BIGINT())
                         ]),
                         func_type='pandas')

        pandas_udf_2 = udf(func2,
                           result_type=DataTypes.ROW([
                               DataTypes.FIELD("c", DataTypes.BIGINT()),
                               DataTypes.FIELD("d", DataTypes.BIGINT())
                           ]),
                           func_type='pandas')

        general_udf = udf(func3,
                          result_type=DataTypes.ROW([
                              DataTypes.FIELD("c", DataTypes.BIGINT()),
                              DataTypes.FIELD("d", DataTypes.BIGINT())
                          ]))

        t.map(pandas_udf).map(pandas_udf_2).map(general_udf).execute_insert(
            "Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(
            actual,
            ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
Example #12
 def __init__(self, field_data_converters: List[DataConverter],
              field_names: List[str]):
     self._field_data_converters = field_data_converters
     self._reuse_row = Row()
     self._reuse_external_row_data = [
         None for _ in range(len(field_data_converters))
     ]
     self._reuse_external_row = [None, self._reuse_external_row_data]
     self._reuse_row.set_field_names(field_names)
Example #13
    def encode_to_stream(self, value: Row, out_stream: OutputStream):
        # encode mask value
        values = value.get_fields_by_names(self._field_names)
        self._mask_utils.write_mask(values,
                                    value.get_row_kind().value, out_stream)

        # encode every field value
        for i in range(self._field_count):
            item = values[i]
            if item is not None:
                self._field_coders[i].encode_to_stream(item, out_stream)
Example #14
    def _emit_output(self, output_result):
        for result in output_result:
            yield Row(None, None, None, result)

        for result in self._collector.buf:
            # 0: proc time timer data
            # 1: event time timer data
            # 2: normal data
            # result_row: [TIMER_FLAG, TIMER TYPE, TIMER_KEY, RESULT_DATA]
            yield Row(result[0], result[1], result[2], None)

        self._collector.clear()
Example #15
 def wrapped_func(value):
     if value[0]:
         result = co_flat_map_func.flat_map1(value[1])
         if result:
             for result_val in result:
                 yield Row(CoFlatMapFunctionOutputFlag.LEFT.value, result_val)
         yield Row(CoFlatMapFunctionOutputFlag.LEFT_END.value, None)
     else:
         result = co_flat_map_func.flat_map2(value[2])
         if result:
             for result_val in result:
                 yield Row(CoFlatMapFunctionOutputFlag.RIGHT.value, result_val)
         yield Row(CoFlatMapFunctionOutputFlag.RIGHT_END.value, None)
Example #16
def _emit_results(timestamp, watermark, results, has_side_output):
    if results:
        if has_side_output:
            for result in results:
                if isinstance(result, tuple) and isinstance(
                        result[0], OutputTag):
                    yield cast(OutputTag,
                               result[0]).tag_id, Row(timestamp, watermark,
                                                      result[1])
                else:
                    yield DEFAULT_OUTPUT_TAG, Row(timestamp, watermark, result)
        else:
            for result in results:
                yield Row(timestamp, watermark, result)
Example #17
 def test_row_coder(self):
     from pyflink.common import Row, RowKind
     field_coder = BigIntCoder()
     field_count = 10
     coder = RowCoder([field_coder for _ in range(field_count)])
     v = Row(*[None if i % 2 == 0 else i for i in range(field_count)])
     v.set_row_kind(RowKind.INSERT)
     self.check_coder(coder, v)
     v.set_row_kind(RowKind.UPDATE_BEFORE)
     self.check_coder(coder, v)
     v.set_row_kind(RowKind.UPDATE_AFTER)
     self.check_coder(coder, v)
     v.set_row_kind(RowKind.DELETE)
     self.check_coder(coder, v)
Example #18
 def test_row_coder(self):
     from pyflink.common import Row, RowKind
     field_coder = BigIntCoder()
     field_count = 10
     field_names = ['f{}'.format(i) for i in range(field_count)]
     coder = RowCoder([field_coder for _ in range(field_count)], field_names)
     v = Row(**{field_names[i]: None if i % 2 == 0 else i for i in range(field_count)})
     v.set_row_kind(RowKind.INSERT)
     self.check_coder(coder, v)
     v.set_row_kind(RowKind.UPDATE_BEFORE)
     self.check_coder(coder, v)
     v.set_row_kind(RowKind.UPDATE_AFTER)
     self.check_coder(coder, v)
     v.set_row_kind(RowKind.DELETE)
     self.check_coder(coder, v)
Example #19
    def test_map_with_pandas_udf(self):
        t = self.t_env.from_elements(
            [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)), (2, Row(3, 4))],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b",
                                 DataTypes.ROW([DataTypes.FIELD("c", DataTypes.INT()),
                                                DataTypes.FIELD("d", DataTypes.INT())]))]))

        sink_table_ddl = """
        CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)

        def func(x):
            import pandas as pd
            res = pd.concat([x.a, x.c + x.d], axis=1)
            return res

        def func2(x):
            return x * 2

        def func3(x):
            assert isinstance(x, Row)
            return x

        pandas_udf = udf(func,
                         result_type=DataTypes.ROW(
                             [DataTypes.FIELD("c", DataTypes.BIGINT()),
                              DataTypes.FIELD("d", DataTypes.BIGINT())]),
                         func_type='pandas')

        pandas_udf_2 = udf(func2,
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("c", DataTypes.BIGINT()),
                                DataTypes.FIELD("d", DataTypes.BIGINT())]),
                           func_type='pandas')

        general_udf = udf(func3,
                          result_type=DataTypes.ROW(
                              [DataTypes.FIELD("c", DataTypes.BIGINT()),
                               DataTypes.FIELD("d", DataTypes.BIGINT())]))

        t.map(pandas_udf).map(pandas_udf_2).map(general_udf).execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(
            actual,
            ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
Example #20
 def _encode_one_row_to_stream(self, value: Row, out_stream, nested):
     field_coders = self._field_coders
     # write the row kind and the null mask of the fields first
     self._write_mask(value, out_stream, value.get_row_kind().value)
     # then encode every non-null field value
     for i in range(self._field_count):
         item = value[i]
         if item is not None:
             field_coders[i].encode_to_stream(item, out_stream, nested)
Example #21
def state_access_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    seq_num_source = NumberSequenceSource(1, 10)

    output_path = '/opt/examples/datastream/output/state_access'
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='seq_num_source',
        type_info=Types.LONG())

    ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .map(MyMapFunction(), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .process(MyKeyedProcessFunction(), Types.LONG()) \
        .sink_to(file_sink)

    env.execute('11-data_stream_state_access')
Example #22
def wrap_inputs_as_row(*args):
    from pyflink.common.types import Row
    import pandas as pd
    if type(args[0]) == pd.Series:
        return pd.concat(args, axis=1)
    else:
        return Row(*args)
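
A minimal usage sketch for the helper above (the variable names are illustrative, not from the source): plain Python arguments are wrapped into a Row, while pandas Series arguments are concatenated column-wise into a DataFrame.

import pandas as pd

plain_row = wrap_inputs_as_row(1, 'hello')           # Row(1, 'hello')
vectorized = wrap_inputs_as_row(pd.Series([1, 2]),
                                pd.Series([3, 4]))   # 2x2 pandas DataFrame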
Example #23
def _create_orc_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('bytes', DataTypes.BYTES()),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE().bridged_to('java.sql.Date')),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp')),
    ])
    row_type_info = Types.ROW_NAMED(
        ['char', 'varchar', 'bytes', 'boolean', 'decimal', 'int', 'bigint', 'double',
         'date', 'timestamp'],
        [Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()), Types.BOOLEAN(),
         Types.BIG_DEC(), Types.INT(), Types.LONG(), Types.DOUBLE(), Types.SQL_DATE(),
         Types.SQL_TIMESTAMP()]
    )
    data = [Row(
        char='char',
        varchar='varchar',
        bytes=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=date(1970, 1, 1),
        timestamp=datetime(1970, 1, 2, 3, 4, 5, 600000),
    )]
    return row_type, row_type_info, data
Example #24
    def test_aggregate_with_pandas_udaf_without_keys(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD("b", DataTypes.SMALLINT()),
                DataTypes.FIELD("c", DataTypes.INT())
            ]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.FLOAT(), DataTypes.INT()])
        self.t_env.register_table_sink("Results", table_sink)
        pandas_udaf = udaf(lambda pd: Row(pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW([
                               DataTypes.FIELD("a", DataTypes.FLOAT()),
                               DataTypes.FIELD("b", DataTypes.INT())
                           ]),
                           func_type="pandas")
        t.select(t.b) \
            .aggregate(pandas_udaf.alias("a", "b")) \
            .select("a, b") \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[3.8, 8]"])
Example #25
    def test_map(self):
        t = self.t_env.from_elements(
            [(1, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.TINYINT()),
                DataTypes.FIELD("b", DataTypes.SMALLINT()),
                DataTypes.FIELD("c", DataTypes.INT())
            ]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)

        func = udf(lambda x: Row(x + 1, x * x),
                   result_type=DataTypes.ROW([
                       DataTypes.FIELD("a", DataTypes.BIGINT()),
                       DataTypes.FIELD("b", DataTypes.BIGINT())
                   ]))

        t.map(func(t.b)).alias("a", "b") \
            .map(func(t.a)).alias("a", "b") \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["4,9", "3,4", "7,36", "10,81", "5,16"])
Example #26
 def partition_custom_map(self, value):
     if self.num_partitions is None:
         self.num_partitions = int(
             os.environ[data_stream_num_partitions_env_key])
     partition = partitioner.partition(key_selector.get_key(value),
                                       self.num_partitions)
     return Row(partition, value)
Example #27
def join_row(left: Row, right: Row):
    fields = []
    for value in left:
        fields.append(value)
    for value in right:
        fields.append(value)
    return Row(*fields)
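
A small illustrative sketch for join_row (the values below are made up for demonstration): the fields of both input rows are concatenated positionally into a new Row.

from pyflink.common import Row

left = Row(1, 'left')
right = Row(2.5, True)
joined = join_row(left, right)   # Row(1, 'left', 2.5, True)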
Example #28
def convert_to_python_obj(data, type_info):
    if type_info == Types.PICKLED_BYTE_ARRAY():
        return pickle.loads(data)
    elif isinstance(type_info, ExternalTypeInfo):
        return convert_to_python_obj(data, type_info._type_info)
    else:
        gateway = get_gateway()
        pickle_bytes = gateway.jvm.PythonBridgeUtils. \
            getPickledBytesFromJavaObject(data, type_info.get_java_type_info())
        if isinstance(type_info, RowTypeInfo) or isinstance(
                type_info, TupleTypeInfo):
            field_data = zip(list(pickle_bytes[1:]),
                             type_info.get_field_types())
            fields = []
            for data, field_type in field_data:
                if len(data) == 0:
                    fields.append(None)
                else:
                    fields.append(
                        pickled_bytes_to_python_converter(data, field_type))
            if isinstance(type_info, RowTypeInfo):
                return Row.of_kind(
                    RowKind(int.from_bytes(pickle_bytes[0], 'little')),
                    *fields)
            else:
                return tuple(fields)
        else:
            return pickled_bytes_to_python_converter(pickle_bytes, type_info)
Example #29
        def wrapped_keyed_process_function(value):
            if value[0] is not None:
                # it is timer data
                # VALUE:
                # TIMER_FLAG, TIMESTAMP_OF_TIMER, CURRENT_WATERMARK, CURRENT_KEY_OF_TIMER, None
                on_timer_ctx.set_timestamp(value[1])
                on_timer_ctx.timer_service().set_current_watermark(value[2])
                state_current_key = value[3]
                user_current_key = state_current_key[0]
                on_timer_ctx.set_current_key(user_current_key)
                keyed_state_backend.set_current_key(state_current_key)
                if value[
                        0] == KeyedProcessFunctionInputFlag.EVENT_TIME_TIMER.value:
                    on_timer_ctx.set_time_domain(TimeDomain.EVENT_TIME)
                elif value[
                        0] == KeyedProcessFunctionInputFlag.PROC_TIME_TIMER.value:
                    on_timer_ctx.set_time_domain(TimeDomain.PROCESSING_TIME)
                else:
                    raise TypeError(
                        "TimeCharacteristic[%s] is not supported." %
                        str(value[0]))
                output_result = on_timer(value[1], on_timer_ctx)
            else:
                # it is normal data
                # VALUE: TIMER_FLAG, CURRENT_TIMESTAMP, CURRENT_WATERMARK, None, NORMAL_DATA
                # NORMAL_DATA: CURRENT_KEY, DATA
                ctx.set_timestamp(value[1])
                ctx.timer_service().set_current_watermark(value[2])
                user_current_key = value[4][0]
                state_current_key = Row(user_current_key)
                ctx.set_current_key(user_current_key)
                keyed_state_backend.set_current_key(state_current_key)

                output_result = process_element(value[4][1], ctx)

            if output_result:
                for result in output_result:
                    yield Row(None, None, None, result)

            for result in collector.buf:
                # 0: proc time timer data
                # 1: event time timer data
                # 2: normal data
                # result_row: [TIMER_FLAG, TIMER TYPE, TIMER_KEY, RESULT_DATA]
                yield Row(result[0], result[1], result[2], None)

            collector.clear()
Example #30
 def test_mixed_with_built_in_functions_with_retract(self):
     self.t_env.get_config().get_configuration().set_string("parallelism.default", "1")
     self.t_env.create_temporary_system_function(
         "concat",
         ConcatAggregateFunction())
     t = self.t_env.from_elements(
         [(1, 'Hi_', 1),
          (1, 'Hi', 2),
          (2, 'Hi_', 3),
          (2, 'Hi', 4),
          (3, None, None),
          (3, None, None),
          (4, 'hello2_', 7),
          (4, 'hello2', 8),
          (5, 'hello_', 9),
          (5, 'hello', 10)], ['a', 'b', 'c'])
     self.t_env.create_temporary_view("source", t)
     table_with_retract_message = self.t_env.sql_query(
         "select a, LAST_VALUE(b) as b, LAST_VALUE(c) as c from source group by a")
     self.t_env.create_temporary_view("retract_table", table_with_retract_message)
     result_table = self.t_env.sql_query(
         "select concat(b, ',') as a, "
         "FIRST_VALUE(b) as b, "
         "LAST_VALUE(b) as c, "
         "COUNT(c) as d, "
         "COUNT(1) as e, "
         "LISTAGG(b) as f,"
         "LISTAGG(b, '|') as g,"
         "MAX(c) as h,"
         "MAX(cast(c as float) + 1) as i,"
         "MIN(c) as j,"
         "MIN(cast(c as decimal) + 1) as k,"
         "SUM(c) as l,"
         "SUM(cast(c as float) + 1) as m,"
         "AVG(c) as n,"
         "AVG(cast(c as double) + 1) as o,"
         "STDDEV_POP(cast(c as float)),"
         "STDDEV_SAMP(cast(c as float)),"
         "VAR_POP(cast(c as float)),"
         "VAR_SAMP(cast(c as float))"
         " from retract_table")
     result = [i for i in result_table.execute().collect()]
     expected = Row('Hi,Hi,hello,hello2', 'Hi', 'hello', 4, 5, 'Hi,Hi,hello2,hello',
                    'Hi|Hi|hello2|hello', 10, 11.0, 2, Decimal(3.0), 24, 28.0, 6, 7.0,
                    3.1622777, 3.6514838, 10.0, 13.333333)
     expected.set_row_kind(RowKind.UPDATE_AFTER)
     self.assertEqual(result[len(result) - 1], expected)