Example #1
0
    def test_window_all_reduce_process(self):
        self.env.set_parallelism(1)
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyProcessFunction(ProcessAllWindowFunction):
            def process(self, context: 'ProcessAllWindowFunction.Context',
                        elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
                yield "current window start at {}, reduce result {}".format(
                    context.window().start,
                    next(iter(elements)),
                )

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .window_all(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .reduce(lambda a, b: (a[0], a[1] + b[1]),
                    window_function=MyProcessFunction(),
                    output_type=Types.STRING()) \
            .add_sink(self.test_sink)

        self.env.execute('test_window_all_reduce_process')
        results = self.test_sink.get_results()
        expected = [
            "current window start at 1, reduce result ('a', 6)",
            "current window start at 6, reduce result ('a', 23)",
            "current window start at 15, reduce result ('a', 15)"
        ]
        self.assert_equals_sorted(expected, results)
Example #2
0
    def test_window_aggregate_accumulator_type(self):
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyAggregateFunction(AggregateFunction):
            def create_accumulator(self) -> Tuple[int, str]:
                return 0, ''

            def add(self, value: Tuple[str, int],
                    accumulator: Tuple[int, str]) -> Tuple[int, str]:
                return value[1] + accumulator[0], value[0]

            def get_result(self, accumulator: Tuple[str, int]):
                return accumulator[1], accumulator[0]

            def merge(self, acc_a: Tuple[int, str], acc_b: Tuple[int, str]):
                return acc_a[0] + acc_b[0], acc_a[1]

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .aggregate(MyAggregateFunction(),
                       accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                       output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_time_window_aggregate_accumulator_type')
        results = self.test_sink.get_results()
        expected = ['(a,15)', '(a,3)', '(a,6)', '(b,17)', '(b,3)']
        self.assert_equals_sorted(expected, results)
Example #3
0
    def test_window_aggregate_passthrough(self):
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyAggregateFunction(AggregateFunction):
            def create_accumulator(self) -> Tuple[str, Dict[int, int]]:
                return '', {0: 0, 1: 0}

            def add(
                self, value: Tuple[str, int], accumulator: Tuple[str,
                                                                 Dict[int,
                                                                      int]]
            ) -> Tuple[str, Dict[int, int]]:
                number_map = accumulator[1]
                number_map[value[1] % 2] += 1
                return value[0], number_map

            def get_result(
                    self, accumulator: Tuple[str,
                                             Dict[int,
                                                  int]]) -> Tuple[str, int]:
                number_map = accumulator[1]
                return accumulator[0], number_map[0] - number_map[1]

            def merge(
                self, acc_a: Tuple[str, Dict[int, int]],
                acc_b: Tuple[str, Dict[int,
                                       int]]) -> Tuple[str, Dict[int, int]]:
                number_map_a = acc_a[1]
                number_map_b = acc_b[1]
                new_number_map = {
                    0: number_map_a[0] + number_map_b[0],
                    1: number_map_a[1] + number_map_b[1]
                }
                return acc_a[0], new_number_map

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .aggregate(MyAggregateFunction(),
                       output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_time_window_aggregate_passthrough')
        results = self.test_sink.get_results()
        expected = ['(a,-1)', '(a,0)', '(a,1)', '(b,-1)', '(b,0)']
        self.assert_equals_sorted(expected, results)
Example #4
0
    def test_window_aggregate_process(self):
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyAggregateFunction(AggregateFunction):
            def create_accumulator(self) -> Tuple[int, str]:
                return 0, ''

            def add(self, value: Tuple[str, int],
                    accumulator: Tuple[int, str]) -> Tuple[int, str]:
                return value[1] + accumulator[0], value[0]

            def get_result(self, accumulator: Tuple[str, int]):
                return accumulator[1], accumulator[0]

            def merge(self, acc_a: Tuple[int, str], acc_b: Tuple[int, str]):
                return acc_a[0] + acc_b[0], acc_a[1]

        class MyProcessWindowFunction(ProcessWindowFunction):
            def process(self, key: str, context: ProcessWindowFunction.Context,
                        elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
                agg_result = next(iter(elements))
                yield "key {} timestamp sum {}".format(agg_result[0],
                                                       agg_result[1])

            def clear(self, context: ProcessWindowFunction.Context) -> None:
                pass

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .aggregate(MyAggregateFunction(),
                       window_function=MyProcessWindowFunction(),
                       accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                       output_type=Types.STRING()) \
            .add_sink(self.test_sink)

        self.env.execute('test_time_window_aggregate_accumulator_type')
        results = self.test_sink.get_results()
        expected = [
            'key a timestamp sum 15', 'key a timestamp sum 3',
            'key a timestamp sum 6', 'key b timestamp sum 17',
            'key b timestamp sum 3'
        ]
        self.assert_equals_sorted(expected, results)
Example #5
0
    def test_session_window_late_merge(self):
        data_stream = self.env.from_collection([
            ('hi', 0), ('hi', 8), ('hi', 4)],
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())
        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(5))) \
            .process(CountWindowProcessFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_session_window_late_merge')
        results = self.test_sink.get_results()
        expected = ['(hi,3)']
        self.assert_equals_sorted(expected, results)
Example #6
0
    def test_window_reduce_passthrough(self):
        data_stream = self.env.from_collection([
            ('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())
        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .reduce(lambda a, b: (b[0], a[1] + b[1]),
                    output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_time_window_reduce_passthrough')
        results = self.test_sink.get_results()
        expected = ['(a,3)', '(a,6)', '(a,15)', '(b,3)', '(b,17)']
        self.assert_equals_sorted(expected, results)
Example #7
0
    def test_event_time_dynamic_gap_session_window(self):
        self.env.set_parallelism(1)
        data_stream = self.env.from_collection([
            ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 9), ('hi', 9), ('hi', 15)],
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_dynamic_gap(MySessionWindowTimeGapExtractor())) \
            .process(CountWindowProcessFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_event_time_dynamic_gap_session_window')
        results = self.test_sink.get_results()
        expected = ['(hi,3)', '(hi,4)']
        self.assert_equals_sorted(expected, results)
Example #8
0
    def test_event_time_session_window_with_purging_trigger(self):
        data_stream = self.env.from_collection([
            ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9), ('hi', 15)],
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(3))) \
            .trigger(PurgingTrigger.of(EventTimeTrigger.create())) \
            .process(CountWindowProcessFunction(),
                     Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_event_time_session_window_with_purging_trigger')
        results = self.test_sink.get_results()
        expected = ['(hi,1,7,4)', '(hi,8,12,2)', '(hi,15,18,1)']
        self.assert_equals_sorted(expected, results)
Example #9
0
    # define the source
    data_stream = env.from_collection([('hi', 1), ('hi', 2), ('hi', 3),
                                       ('hi', 4), ('hi', 8), ('hi', 9),
                                       ('hi', 15)],
                                      type_info=Types.TUPLE(
                                          [Types.STRING(),
                                           Types.INT()]))

    # define the watermark strategy
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(MyTimestampAssigner())

    ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(5))) \
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()]))

    # define the sink
    if output_path is not None:
        ds.sink_to(sink=FileSink.for_row_format(
            base_path=output_path, encoder=Encoder.simple_string_encoder()
        ).with_output_file_config(OutputFileConfig.builder().with_part_prefix(
            "prefix").with_part_suffix(".ext").build()).with_rolling_policy(
                RollingPolicy.default_rolling_policy()).build())
    else:
        print(
            "Printing result to stdout. Use --output to specify output path.")
        ds.print()
Example #10
0
    env = StreamExecutionEnvironment.get_execution_environment()
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    data_stream = env.from_collection([
        ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9), ('hi', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

    # define the watermark strategy
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(MyTimestampAssigner())

    ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(EventTimeSessionWindows.with_dynamic_gap(MySessionWindowTimeGapExtractor())) \
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()]))

    # define the sink
    if output_path is not None:
        ds.sink_to(
            sink=FileSink.for_row_format(
                base_path=output_path,
                encoder=Encoder.simple_string_encoder())
            .with_output_file_config(
                OutputFileConfig.builder()
                .with_part_prefix("prefix")
                .with_part_suffix(".ext")
                .build())
            .with_rolling_policy(RollingPolicy.default_rolling_policy())