Example #1
    def test_window_aggregate_accumulator_type(self):
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyAggregateFunction(AggregateFunction):
            def create_accumulator(self) -> Tuple[int, str]:
                return 0, ''

            def add(self, value: Tuple[str, int],
                    accumulator: Tuple[int, str]) -> Tuple[int, str]:
                return value[1] + accumulator[0], value[0]

            def get_result(self, accumulator: Tuple[int, str]) -> Tuple[str, int]:
                return accumulator[1], accumulator[0]

            def merge(self, acc_a: Tuple[int, str], acc_b: Tuple[int, str]):
                return acc_a[0] + acc_b[0], acc_a[1]

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .aggregate(MyAggregateFunction(),
                       accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                       output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_time_window_aggregate_accumulator_type')
        results = self.test_sink.get_results()
        expected = ['(a,15)', '(a,3)', '(a,6)', '(b,17)', '(b,3)']
        self.assert_equals_sorted(expected, results)
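Several examples below reference a SecondColumnTimestampAssigner defined elsewhere in the test module. A minimal sketch, assuming the second tuple field carries the event timestamp:

from pyflink.common.watermark_strategy import TimestampAssigner

class SecondColumnTimestampAssigner(TimestampAssigner):
    # assumed helper: the second field of each record is its event timestamp
    def extract_timestamp(self, value, record_timestamp) -> int:
        return int(value[1])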
Example #2
    def test_reducing_state(self):
        self.env.set_parallelism(2)
        data_stream = self.env.from_collection(
            [(1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'), (5, 'hi'),
             (6, 'hello')],
            type_info=Types.TUPLE([Types.INT(), Types.STRING()]))

        class MyProcessFunction(KeyedProcessFunction):
            def __init__(self):
                self.reducing_state = None  # type: ReducingState

            def open(self, runtime_context: RuntimeContext):
                self.reducing_state = runtime_context.get_reducing_state(
                    ReducingStateDescriptor('reducing_state',
                                            lambda i, i2: i + i2, Types.INT()))

            def process_element(self, value, ctx):
                self.reducing_state.add(value[0])
                yield Row(self.reducing_state.get(), value[1])

        data_stream.key_by(lambda x: x[1], key_type_info=Types.STRING()) \
            .process(MyProcessFunction(), output_type=Types.TUPLE([Types.INT(), Types.STRING()])) \
            .add_sink(self.test_sink)
        self.env.execute('test_reducing_state')
        result = self.test_sink.get_results()
        expected_result = [
            '(1,hi)', '(2,hello)', '(4,hi)', '(6,hello)', '(9,hi)',
            '(12,hello)'
        ]
        result.sort()
        expected_result.sort()
        self.assertEqual(expected_result, result)
Example #3
    def test_global_window_with_purging_trigger(self):
        self.env.set_parallelism(1)
        data_stream = self.env.from_collection(
            [('hi', 1), ('hi', 1), ('hi', 1), ('hi', 1), ('hi', 1), ('hi', 1),
             ('hi', 1)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream

        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyProcessFunction(ProcessWindowFunction):
            def clear(self, context: ProcessWindowFunction.Context) -> None:
                pass

            def process(
                    self, key, context: ProcessWindowFunction.Context,
                    elements: Iterable[Tuple[str, int]]) -> Iterable[tuple]:
                return [(key, len([e for e in elements]))]

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(GlobalWindows.create()) \
            .trigger(PurgingTrigger.of(CountTrigger.of(2))) \
            .process(MyProcessFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_global_window_with_purging_trigger')
        results = self.test_sink.get_results()
        expected = ['(hi,2)', '(hi,2)', '(hi,2)']
        self.assert_equals_sorted(expected, results)
Example #4
    def test_window_aggregate_passthrough(self):
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyAggregateFunction(AggregateFunction):
            def create_accumulator(self) -> Tuple[str, Dict[int, int]]:
                return '', {0: 0, 1: 0}

            def add(self, value: Tuple[str, int],
                    accumulator: Tuple[str, Dict[int, int]]
                    ) -> Tuple[str, Dict[int, int]]:
                number_map = accumulator[1]
                number_map[value[1] % 2] += 1
                return value[0], number_map

            def get_result(self,
                           accumulator: Tuple[str, Dict[int, int]]) -> Tuple[str, int]:
                number_map = accumulator[1]
                return accumulator[0], number_map[0] - number_map[1]

            def merge(self, acc_a: Tuple[str, Dict[int, int]],
                      acc_b: Tuple[str, Dict[int, int]]
                      ) -> Tuple[str, Dict[int, int]]:
                number_map_a = acc_a[1]
                number_map_b = acc_b[1]
                new_number_map = {
                    0: number_map_a[0] + number_map_b[0],
                    1: number_map_a[1] + number_map_b[1]
                }
                return acc_a[0], new_number_map

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .aggregate(MyAggregateFunction(),
                       output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_time_window_aggregate_passthrough')
        results = self.test_sink.get_results()
        expected = ['(a,-1)', '(a,0)', '(a,1)', '(b,-1)', '(b,0)']
        self.assert_equals_sorted(expected, results)
Example #5
    def test_count_sliding_window(self):
        data_stream = self.env.from_collection([
            (1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'), (5, 'hi'), (6, 'hello')],
            type_info=Types.TUPLE([Types.INT(), Types.STRING()]))  # type: DataStream
        data_stream.key_by(lambda x: x[1], key_type=Types.STRING()) \
            .window(CountSlidingWindowAssigner(2, 1)) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_count_sliding_window')
        results = self.test_sink.get_results()
        expected = ['(hello,6)', '(hi,8)', '(hi,4)', '(hello,10)']
        self.assert_equals_sorted(expected, results)
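SumWindowFunction is another helper assumed by this test (and by the table-conversion examples further down, where the row layout differs). A plausible sketch for the count-window case above, summing the first field per key:

from typing import Iterable
from pyflink.datastream.functions import WindowFunction
from pyflink.datastream.window import CountWindow

class SumWindowFunction(WindowFunction[tuple, tuple, str, CountWindow]):
    # assumed helper: emits (key, sum of the first tuple field) per window
    def apply(self, key: str, window: CountWindow, inputs: Iterable[tuple]):
        return [(key, sum(i[0] for i in inputs))]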
Example #6
    def test_window_aggregate_process(self):
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyAggregateFunction(AggregateFunction):
            def create_accumulator(self) -> Tuple[int, str]:
                return 0, ''

            def add(self, value: Tuple[str, int],
                    accumulator: Tuple[int, str]) -> Tuple[int, str]:
                return value[1] + accumulator[0], value[0]

            def get_result(self, accumulator: Tuple[int, str]):
                return accumulator[1], accumulator[0]

            def merge(self, acc_a: Tuple[int, str], acc_b: Tuple[int, str]):
                return acc_a[0] + acc_b[0], acc_a[1]

        class MyProcessWindowFunction(ProcessWindowFunction):
            def process(self, key: str, context: ProcessWindowFunction.Context,
                        elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
                agg_result = next(iter(elements))
                yield "key {} timestamp sum {}".format(agg_result[0],
                                                       agg_result[1])

            def clear(self, context: ProcessWindowFunction.Context) -> None:
                pass

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .aggregate(MyAggregateFunction(),
                       window_function=MyProcessWindowFunction(),
                       accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                       output_type=Types.STRING()) \
            .add_sink(self.test_sink)

        self.env.execute('test_window_aggregate_process')
        results = self.test_sink.get_results()
        expected = [
            'key a timestamp sum 15', 'key a timestamp sum 3',
            'key a timestamp sum 6', 'key b timestamp sum 17',
            'key b timestamp sum 3'
        ]
        self.assert_equals_sorted(expected, results)
Example #7
    def test_tuple_type(self):
        self.assertEqual(TupleTypeInfo([Types.STRING(), Types.INT()]),
                         TupleTypeInfo([Types.STRING(), Types.INT()]))

        self.assertEqual(TupleTypeInfo([Types.STRING(), Types.INT()]).__str__(),
                         "TupleTypeInfo(String, Integer)")

        self.assertNotEqual(TupleTypeInfo([Types.STRING(), Types.INT()]),
                            TupleTypeInfo([Types.STRING(), Types.BOOLEAN()]))

        self.assertEqual(Types.TUPLE([Types.STRING(), Types.INT()]),
                         TupleTypeInfo([Types.STRING(), Types.INT()]))

        self.assertEqual(Types.TUPLE([Types.STRING(), Types.INT()]).get_field_types(),
                         [Types.STRING(), Types.INT()])
Example #8
    def test_side_output_late_data(self):
        self.env.set_parallelism(1)
        config = Configuration(j_configuration=get_j_env_configuration(
            self.env._j_stream_execution_environment))
        config.set_integer('python.fn-execution.bundle.size', 1)
        jvm = get_gateway().jvm
        watermark_strategy = WatermarkStrategy(
            jvm.org.apache.flink.api.common.eventtime.WatermarkStrategy.
            forGenerator(jvm.org.apache.flink.streaming.api.functions.python.
                         eventtime.PerElementWatermarkGenerator.getSupplier())
        ).with_timestamp_assigner(SecondColumnTimestampAssigner())

        tag = OutputTag('late-data',
                        type_info=Types.ROW([Types.STRING(),
                                             Types.INT()]))
        ds1 = self.env.from_collection(
            [('a', 0), ('a', 8), ('a', 4), ('a', 6)],
            type_info=Types.ROW([Types.STRING(), Types.INT()]))
        ds2 = ds1.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda e: e[0]) \
            .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
            .allowed_lateness(0) \
            .side_output_late_data(tag) \
            .process(CountWindowProcessFunction(),
                     Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()]))
        main_sink = DataStreamTestSinkFunction()
        ds2.add_sink(main_sink)
        side_sink = DataStreamTestSinkFunction()
        ds2.get_side_output(tag).add_sink(side_sink)

        self.env.execute('test_side_output_late_data')
        main_expected = ['(a,0,5,1)', '(a,5,10,2)']
        self.assert_equals_sorted(main_expected, main_sink.get_results())
        side_expected = ['+I[a, 4]']
        self.assert_equals_sorted(side_expected, side_sink.get_results())
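CountWindowProcessFunction is likewise defined outside these snippets. A sketch matching the four-field output used here and in the tumbling-window tests (Examples #12 and #16 appear to rely on an older two-field variant):

from typing import Iterable
from pyflink.datastream.functions import ProcessWindowFunction
from pyflink.datastream.window import TimeWindow

class CountWindowProcessFunction(ProcessWindowFunction[tuple, tuple, str, TimeWindow]):
    # assumed helper: emits (key, window start, window end, element count)
    def process(self, key: str, context: ProcessWindowFunction.Context,
                elements: Iterable[tuple]) -> Iterable[tuple]:
        return [(key, context.window().start, context.window().end,
                 len([e for e in elements]))]

    def clear(self, context: ProcessWindowFunction.Context) -> None:
        pass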
Example #9
def event_timer_timer_demo():
    env = StreamExecutionEnvironment.get_execution_environment()

    ds = env.from_collection(
        collection=[
            (1000, 'Alice', 110.1),
            (4000, 'Bob', 30.2),
            (3000, 'Alice', 20.0),
            (2000, 'Bob', 53.1),
            (5000, 'Alice', 13.1),
            (3000, 'Bob', 3.1),
            (7000, 'Bob', 16.1),
            (10000, 'Alice', 20.1)
        ],
        type_info=Types.TUPLE([Types.LONG(), Types.STRING(), Types.FLOAT()]))

    ds = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(2))
                         .with_timestamp_assigner(MyTimestampAssigner()))

    # apply the process function onto a keyed stream
    ds.key_by(lambda value: value[1]) \
      .process(Sum()) \
      .print()

    # submit for execution
    env.execute()
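This demo depends on MyTimestampAssigner and a Sum KeyedProcessFunction defined elsewhere (Sum registers event-time timers and is omitted here). A minimal sketch of the assigner, assuming the first tuple field is the event timestamp in milliseconds:

from pyflink.common.watermark_strategy import TimestampAssigner

class MyTimestampAssigner(TimestampAssigner):
    # assumed helper: the first field of each record is its event timestamp
    def extract_timestamp(self, value, record_timestamp) -> int:
        return int(value[0])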
Example #10
    def test_window_all_reduce_process(self):
        self.env.set_parallelism(1)
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyProcessFunction(ProcessAllWindowFunction):
            def process(self, context: 'ProcessAllWindowFunction.Context',
                        elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
                yield "current window start at {}, reduce result {}".format(
                    context.window().start,
                    next(iter(elements)),
                )

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .window_all(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .reduce(lambda a, b: (a[0], a[1] + b[1]),
                    window_function=MyProcessFunction(),
                    output_type=Types.STRING()) \
            .add_sink(self.test_sink)

        self.env.execute('test_window_all_reduce_process')
        results = self.test_sink.get_results()
        expected = [
            "current window start at 1, reduce result ('a', 6)",
            "current window start at 6, reduce result ('a', 23)",
            "current window start at 15, reduce result ('a', 15)"
        ]
        self.assert_equals_sorted(expected, results)
Example #11
    def test_project(self):
        ds = self.env.from_collection([[1, 2, 3, 4], [5, 6, 7, 8]],
                                      type_info=Types.TUPLE(
                                          [Types.INT(), Types.INT(), Types.INT(), Types.INT()]))
        ds.project(1, 3).map(lambda x: (x[0], x[1] + 1)).add_sink(self.test_sink)
        exec_plan = eval(self.env.get_execution_plan())
        self.assertEqual(exec_plan['nodes'][1]['type'], 'Projection')
Example #12
    def test_session_window_late_merge(self):
        data_stream = self.env.from_collection([
            ('hi', 0), ('hi', 8), ('hi', 4)],
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())
        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(5))) \
            .process(CountWindowProcessFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_session_window_late_merge')
        results = self.test_sink.get_results()
        expected = ['(hi,3)']
        self.assert_equals_sorted(expected, results)
Example #13
    def test_window_reduce_passthrough(self):
        data_stream = self.env.from_collection([
            ('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())
        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .reduce(lambda a, b: (b[0], a[1] + b[1]),
                    output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_time_window_reduce_passthrough')
        results = self.test_sink.get_results()
        expected = ['(a,3)', '(a,6)', '(a,15)', '(b,3)', '(b,17)']
        self.assert_equals_sorted(expected, results)
Example #14
    def test_aggregating_state(self):
        self.env.set_parallelism(2)
        data_stream = self.env.from_collection(
            [(1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'), (5, 'hi'),
             (6, 'hello')],
            type_info=Types.TUPLE([Types.INT(), Types.STRING()]))

        class MyAggregateFunction(AggregateFunction):
            def create_accumulator(self):
                return 0

            def add(self, value, accumulator):
                return value + accumulator

            def get_result(self, accumulator):
                return accumulator

            def merge(self, acc_a, acc_b):
                return acc_a + acc_b

        class MyProcessFunction(KeyedProcessFunction):
            def __init__(self):
                self.aggregating_state = None  # type: AggregatingState

            def open(self, runtime_context: RuntimeContext):
                self.aggregating_state = runtime_context.get_aggregating_state(
                    AggregatingStateDescriptor('aggregating_state',
                                               MyAggregateFunction(),
                                               Types.INT()))

            def process_element(self, value, ctx):
                self.aggregating_state.add(value[0])
                yield Row(self.aggregating_state.get(), value[1])

        data_stream.key_by(lambda x: x[1], key_type_info=Types.STRING()) \
            .process(MyProcessFunction(), output_type=Types.TUPLE([Types.INT(), Types.STRING()])) \
            .add_sink(self.test_sink)
        self.env.execute('test_aggregating_state')
        result = self.test_sink.get_results()
        expected_result = [
            '(1,hi)', '(2,hello)', '(4,hi)', '(6,hello)', '(9,hi)',
            '(12,hello)'
        ]
        result.sort()
        expected_result.sort()
        self.assertEqual(expected_result, result)
Example #15
    def test_event_time_tumbling_window(self):
        data_stream = self.env.from_collection([
            ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 5), ('hi', 8), ('hi', 9),
            ('hi', 15)],
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())
        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
            .process(CountWindowProcessFunction(),
                     Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_event_time_tumbling_window')
        results = self.test_sink.get_results()
        expected = ['(hi,0,5,4)', '(hi,5,10,3)', '(hi,15,20,1)']
        self.assert_equals_sorted(expected, results)
Example #16
    def test_event_time_dynamic_gap_session_window(self):
        self.env.set_parallelism(1)
        data_stream = self.env.from_collection([
            ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 9), ('hi', 9), ('hi', 15)],
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_dynamic_gap(MySessionWindowTimeGapExtractor())) \
            .process(CountWindowProcessFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_event_time_dynamic_gap_session_window')
        results = self.test_sink.get_results()
        expected = ['(hi,3)', '(hi,4)']
        self.assert_equals_sorted(expected, results)
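MySessionWindowTimeGapExtractor is assumed by this test. A sketch that derives the session gap from the element itself, here from its second field:

from pyflink.datastream.window import SessionWindowTimeGapExtractor

class MySessionWindowTimeGapExtractor(SessionWindowTimeGapExtractor):
    # assumed helper: the gap in milliseconds is read from the element's second field
    def extract(self, element: tuple) -> int:
        return element[1]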
Example #17
    def test_event_time_session_window_with_purging_trigger(self):
        data_stream = self.env.from_collection([
            ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9), ('hi', 15)],
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(3))) \
            .trigger(PurgingTrigger.of(EventTimeTrigger.create())) \
            .process(CountWindowProcessFunction(),
                     Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_event_time_session_window_with_purging_trigger')
        results = self.test_sink.get_results()
        expected = ['(hi,1,7,4)', '(hi,8,12,2)', '(hi,15,18,1)']
        self.assert_equals_sorted(expected, results)
Example #18
    def test_from_and_to_data_stream_event_time(self):
        from pyflink.table import Schema

        ds = self.env.from_collection([(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
                                      Types.ROW_NAMED(
                                          ["a", "b", "c"],
                                          [Types.LONG(), Types.INT(), Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps()
            .with_timestamp_assigner(MyTimestampAssigner()))

        table = self.t_env.from_data_stream(ds,
                                            Schema.new_builder()
                                                  .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                                  .watermark("rowtime", "SOURCE_WATERMARK()")
                                                  .build())
        self.assertEqual("""(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
                         table._j_table.getResolvedSchema().toString())
        self.t_env.create_temporary_view("t",
                                         ds,
                                         Schema.new_builder()
                                         .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                         .watermark("rowtime", "SOURCE_WATERMARK()")
                                         .build())

        result = self.t_env.execute_sql("SELECT "
                                        "c, SUM(b) "
                                        "FROM t "
                                        "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [item for item in
                               map(str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)

        ds = self.t_env.to_data_stream(table)
        ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
        actual_results = self.test_sink.get_results(False)
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)
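MyTumblingEventTimeWindow, used here and in the changelog-stream test below, is a custom WindowAssigner defined elsewhere. A rough sketch of a 5 ms tumbling event-time assigner, under the assumption that this is what the expected results imply:

from pyflink.datastream.window import (EventTimeTrigger, TimeWindow,
                                       TimeWindowSerializer, WindowAssigner)

class MyTumblingEventTimeWindow(WindowAssigner[tuple, TimeWindow]):
    # assumed helper: fixed-size (5 ms) tumbling windows on event time
    def __init__(self, size: int = 5):
        self._size = size

    def assign_windows(self, element, timestamp, context):
        start = timestamp - (timestamp % self._size)
        return [TimeWindow(start, start + self._size)]

    def get_default_trigger(self, env):
        return EventTimeTrigger.create()

    def get_window_serializer(self):
        return TimeWindowSerializer()

    def is_event_time(self):
        return True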
Example #19
    def test_from_and_to_changelog_stream_event_time(self):
        from pyflink.table import Schema

        self.env.set_parallelism(1)
        ds = self.env.from_collection(
            [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
            Types.ROW([Types.LONG(), Types.INT(),
                       Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps(
            ).with_timestamp_assigner(MyTimestampAssigner()))

        changelog_stream = ds.map(lambda t: Row(t.f1, t.f2),
                                  Types.ROW([Types.INT(),
                                             Types.STRING()]))

        # derive physical columns and add a rowtime
        table = self.t_env.from_changelog_stream(
            changelog_stream,
            Schema.new_builder().column_by_metadata(
                "rowtime", DataTypes.TIMESTAMP_LTZ(3)).column_by_expression(
                    "computed", str(col("f1").upper_case)).watermark(
                        "rowtime", str(source_watermark())).build())

        self.t_env.create_temporary_view("t", table)

        # access and reorder columns
        reordered = self.t_env.sql_query("SELECT computed, rowtime, f0 FROM t")

        # write out the rowtime column with fully declared schema
        result = self.t_env.to_changelog_stream(
            reordered,
            Schema.new_builder().column(
                "f1", DataTypes.STRING()).column_by_metadata(
                    "rowtime",
                    DataTypes.TIMESTAMP_LTZ(3)).column_by_expression(
                        "ignored", str(col("f1").upper_case)).column(
                            "f0", DataTypes.INT()).build())

        # test event time window and field access
        result.key_by(lambda k: k.f1) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(A,47)', '(C,1000)', '(C,1000)']
        actual_results = self.test_sink.get_results(False)
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)
Example #20
    def test_map_function_with_data_types(self):
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                      type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

        def map_func(value):
            result = Row(value[0], len(value[0]), value[1])
            return result

        ds.map(map_func, output_type=Types.ROW([Types.STRING(), Types.INT(), Types.INT()]))\
            .add_sink(self.test_sink)
        self.env.execute('map_function_test')
        results = self.test_sink.get_results(False)
        expected = ['ab,2,1', 'bdc,3,2', 'cfgs,4,3', 'deeefg,6,4']
        expected.sort()
        results.sort()
        self.assertEqual(expected, results)
Example #21
    def test_map_function_with_data_types(self):
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                      type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

        def map_func(value):
            result = (value[0], len(value[0]), value[1])
            return result

        mapped_stream = ds.map(map_func,
                               type_info=Types.ROW([Types.STRING(), Types.INT(), Types.INT()]))
        collect_util = DataStreamCollectUtil()
        collect_util.collect(mapped_stream)
        self.env.execute('map_function_test')
        results = collect_util.results()
        expected = ['ab,2,1', 'bdc,3,2', 'cfgs,4,3', 'deeefg,6,4']
        expected.sort()
        results.sort()
        self.assertEqual(expected, results)
Example #22
    def test_from_java_type(self):
        basic_int_type_info = Types.INT()
        self.assertEqual(basic_int_type_info,
                         _from_java_type(basic_int_type_info.get_java_type_info()))

        basic_short_type_info = Types.SHORT()
        self.assertEqual(basic_short_type_info,
                         _from_java_type(basic_short_type_info.get_java_type_info()))

        basic_long_type_info = Types.LONG()
        self.assertEqual(basic_long_type_info,
                         _from_java_type(basic_long_type_info.get_java_type_info()))

        basic_float_type_info = Types.FLOAT()
        self.assertEqual(basic_float_type_info,
                         _from_java_type(basic_float_type_info.get_java_type_info()))

        basic_double_type_info = Types.DOUBLE()
        self.assertEqual(basic_double_type_info,
                         _from_java_type(basic_double_type_info.get_java_type_info()))

        basic_char_type_info = Types.CHAR()
        self.assertEqual(basic_char_type_info,
                         _from_java_type(basic_char_type_info.get_java_type_info()))

        basic_byte_type_info = Types.BYTE()
        self.assertEqual(basic_byte_type_info,
                         _from_java_type(basic_byte_type_info.get_java_type_info()))

        basic_big_int_type_info = Types.BIG_INT()
        self.assertEqual(basic_big_int_type_info,
                         _from_java_type(basic_big_int_type_info.get_java_type_info()))

        basic_big_dec_type_info = Types.BIG_DEC()
        self.assertEqual(basic_big_dec_type_info,
                         _from_java_type(basic_big_dec_type_info.get_java_type_info()))

        basic_sql_date_type_info = Types.SQL_DATE()
        self.assertEqual(basic_sql_date_type_info,
                         _from_java_type(basic_sql_date_type_info.get_java_type_info()))

        basic_sql_time_type_info = Types.SQL_TIME()
        self.assertEqual(basic_sql_time_type_info,
                         _from_java_type(basic_sql_time_type_info.get_java_type_info()))

        basic_sql_timestamp_type_info = Types.SQL_TIMESTAMP()
        self.assertEqual(basic_sql_timestamp_type_info,
                         _from_java_type(basic_sql_timestamp_type_info.get_java_type_info()))

        row_type_info = Types.ROW([Types.INT(), Types.STRING()])
        self.assertEqual(row_type_info, _from_java_type(row_type_info.get_java_type_info()))

        tuple_type_info = Types.TUPLE([Types.CHAR(), Types.INT()])
        self.assertEqual(tuple_type_info, _from_java_type(tuple_type_info.get_java_type_info()))

        primitive_int_array_type_info = Types.PRIMITIVE_ARRAY(Types.INT())
        self.assertEqual(primitive_int_array_type_info,
                         _from_java_type(primitive_int_array_type_info.get_java_type_info()))

        object_array_type_info = Types.OBJECT_ARRAY(Types.SQL_DATE())
        self.assertEqual(object_array_type_info,
                         _from_java_type(object_array_type_info.get_java_type_info()))

        pickled_byte_array_type_info = Types.PICKLED_BYTE_ARRAY()
        self.assertEqual(pickled_byte_array_type_info,
                         _from_java_type(pickled_byte_array_type_info.get_java_type_info()))

        map_type_info = Types.MAP(Types.INT(), Types.STRING())
        self.assertEqual(map_type_info,
                         _from_java_type(map_type_info.get_java_type_info()))

        list_type_info = Types.LIST(Types.INT())
        self.assertEqual(list_type_info,
                         _from_java_type(list_type_info.get_java_type_info()))
Example #23
def ds_operators():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_python_executable(
        r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(),
                             Types.STRING(),
                             Types.STRING()]))
    """
    map
    flat_map
    filter
    key_by DataStream → KeyedStream
    reduce KeyedStream → DataStream
    union DataStream* → DataStream
    connect DataStream,DataStream → ConnectedStreams
    转换元组:
    project
    分区:
    partition_custom 自定义分区
    shuffle 随机分区 根据均匀分布随机划分元素。
    rebalance 轮询分区
    rescale 重新分区
    broadcast 向每个分区广播元素
    随意定制
    process 只有在KeyedStream上应用ProcessFunction时,才可以访问键控状态和计时器TimerService(相当于java的windows)。
    其它
    start_new_chain
    disable_chaining
    slot_sharing_group
    """
    ds.rescale()
    ds.map()
    ds.flat_map()
    ds.filter()
    # KeyBy DataStream → KeyedStream
    # Reduce KeyedStream → DataStream
    ds = s_env.from_collection([(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
                               type_info=Types.ROW(
                                   [Types.INT(), Types.STRING()]))
    ds.key_by(lambda a: a[1]) \
        .reduce(lambda a, b: Row(a[0] + b[0], b[1]))
    # broadcast
    ds.broadcast()
    # project is only available on tuple data streams
    ds = s_env.from_collection([[1, 2, 3, 4], [5, 6, 7, 8]],
                               type_info=Types.TUPLE([
                                   Types.INT(),
                                   Types.INT(),
                                   Types.INT(),
                                   Types.INT()
                               ]))
    # emit the tuple fields at indexes 1 and 3; print() stands in for the
    # original add_sink(), which was called without a sink function
    ds.project(1, 3).map(lambda x: (x[0], x[1] + 1)).print()

    # write the stream to a file sink
    ds.add_sink(
        StreamingFileSink
        .for_row_format('/tmp/output', SimpleStringEncoder())
        .with_rolling_policy(
            DefaultRollingPolicy.builder()
            .with_rollover_interval(15 * 60 * 1000)
            .with_inactivity_interval(5 * 60 * 1000)
            .with_max_part_size(1024 * 1024 * 1024)
            .build())
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix")
            .build())
        .build())
    s_env.execute('ds_operators')