Example #1
0
def python_data_stream_example():
    """Build and asynchronously submit a Kafka-to-Kafka keyed timer job.

    JSON rows are consumed from the ``timer-stream-source`` topic, given
    timestamps and bounded-out-of-orderness watermarks (5 seconds), keyed with
    ``MyKeySelector``, handled by ``MyProcessFunction`` and written as strings
    to the ``timer-stream-sink`` topic.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    # A single parallel instance keeps fired timers and regular records on the
    # same worker, so the collected output stays ordered and easy to assert on.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    field_names = ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId']
    field_types = [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(), Types.INT()]
    row_type_info = Types.ROW_NAMED(field_names, field_types)
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(row_type_info) \
        .build()

    # The same connection properties are shared by the consumer and the producer.
    kafka_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-source'}
    source = FlinkKafkaConsumer("timer-stream-source", deserialization_schema, kafka_props)
    sink = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(), kafka_props)

    out_of_orderness = Duration.of_seconds(5)
    watermark_strategy = WatermarkStrategy \
        .for_bounded_out_of_orderness(out_of_orderness) \
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    source.set_start_from_earliest()
    timestamped = env.add_source(source) \
        .assign_timestamps_and_watermarks(watermark_strategy)
    keyed = timestamped.key_by(MyKeySelector(), key_type_info=Types.LONG())
    processed = keyed.process(MyProcessFunction(), output_type=Types.STRING())
    processed.add_sink(sink)

    env.execute_async("test data stream timer")
Example #2
0
    def test_side_output_late_data(self):
        """Verify the main and late-data outputs of a tumbling event-time window.

        Records are keyed by their first field and grouped into 5 ms tumbling
        event-time windows with zero allowed lateness. The window results and
        the ``late-data`` side output are collected by separate sinks and
        compared with the expected values.
        """
        self.env.set_parallelism(1)
        j_env = self.env._j_stream_execution_environment
        config = Configuration(j_configuration=get_j_env_configuration(j_env))
        config.set_integer('python.fn-execution.bundle.size', 1)

        # Wrap a Java-side per-element watermark generator in a Python strategy.
        jvm = get_gateway().jvm
        j_generator_supplier = (
            jvm.org.apache.flink.streaming.api.functions.python.eventtime
            .PerElementWatermarkGenerator.getSupplier())
        j_strategy = jvm.org.apache.flink.api.common.eventtime.WatermarkStrategy \
            .forGenerator(j_generator_supplier)
        watermark_strategy = WatermarkStrategy(j_strategy) \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        late_tag = OutputTag('late-data',
                             type_info=Types.ROW([Types.STRING(), Types.INT()]))
        source = self.env.from_collection(
            [('a', 0), ('a', 8), ('a', 4), ('a', 6)],
            type_info=Types.ROW([Types.STRING(), Types.INT()]))

        windowed = source.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda e: e[0]) \
            .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
            .allowed_lateness(0) \
            .side_output_late_data(late_tag)
        window_results = windowed.process(
            CountWindowProcessFunction(),
            Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()]))

        main_sink = DataStreamTestSinkFunction()
        window_results.add_sink(main_sink)
        side_sink = DataStreamTestSinkFunction()
        window_results.get_side_output(late_tag).add_sink(side_sink)

        self.env.execute('test_side_output_late_data')

        self.assert_equals_sorted(['(a,0,5,1)', '(a,5,10,2)'], main_sink.get_results())
        self.assert_equals_sorted(['+I[a, 4]'], side_sink.get_results())
Example #3
0
 def _build_csv_job(self, schema):
     """Connect a CSV file source for ``schema`` to the test sink.

     The file at ``self.csv_file_name`` is read without watermarks, passed
     through ``PassThroughMapFunction`` and emitted to ``self.test_sink``.
     """
     csv_format = CsvReaderFormat.for_schema(schema)
     csv_source = FileSource.for_record_stream_format(
         csv_format, self.csv_file_name).build()
     stream = self.env.from_source(
         csv_source, WatermarkStrategy.no_watermarks(), 'csv-source')
     mapped = stream.map(PassThroughMapFunction(),
                         output_type=Types.PICKLED_BYTE_ARRAY())
     mapped.add_sink(self.test_sink)
Example #4
0
 def _build_parquet_avro_job(self, record_schema, parquet_file_name):
     """Connect an Avro-Parquet file source to the test sink.

     :param record_schema: schema handed to ``AvroParquetReaders.for_generic_record``.
     :param parquet_file_name: path of the Parquet file to read.
     """
     reader_format = AvroParquetReaders.for_generic_record(record_schema)
     parquet_source = FileSource.for_record_stream_format(
         reader_format, parquet_file_name).build()
     stream = self.env.from_source(
         parquet_source,
         WatermarkStrategy.for_monotonous_timestamps(),
         "parquet-source")
     mapped = stream.map(PassThroughMapFunction())
     mapped.add_sink(self.test_sink)
Example #5
0
 def _build_parquet_columnar_job(self, row_type: RowType):
     """Connect a columnar Parquet bulk source for ``row_type`` to the test sink.

     The file at ``self.parquet_file_name`` is read without watermarks and
     each record is forwarded unchanged to ``self.test_sink``.
     """
     input_format = ParquetColumnarRowInputFormat(
         row_type, Configuration(), 10, True, False)
     parquet_source = FileSource.for_bulk_file_format(
         input_format, self.parquet_file_name).build()
     stream = self.env.from_source(
         parquet_source, WatermarkStrategy.no_watermarks(), 'parquet-source')
     # Identity map: records are forwarded as they are.
     forwarded = stream.map(lambda record: record)
     forwarded.add_sink(self.test_sink)
Example #6
0
    def test_window_all_reduce_process(self):
        """Reduce non-keyed session windows and post-process each result.

        All records share event-time session windows with a 2 ms gap. Each
        window is reduced by summing the second field, and the window function
        reports the window start together with the reduced value.
        """
        self.env.set_parallelism(1)
        records = [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
                   ('a', 15)]
        data_stream = self.env.from_collection(
            records,
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyProcessFunction(ProcessAllWindowFunction):
            def process(self, context: 'ProcessAllWindowFunction.Context',
                        elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
                window_start = context.window().start
                # Only the first element is read; it carries the reduced value.
                reduced = next(iter(elements))
                yield "current window start at {}, reduce result {}".format(
                    window_start, reduced)

        def sum_second_field(a, b):
            return a[0], a[1] + b[1]

        windowed = data_stream \
            .assign_timestamps_and_watermarks(watermark_strategy) \
            .window_all(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
        reduced_stream = windowed.reduce(sum_second_field,
                                         window_function=MyProcessFunction(),
                                         output_type=Types.STRING())
        reduced_stream.add_sink(self.test_sink)

        self.env.execute('test_window_all_reduce_process')
        expected = [
            "current window start at 1, reduce result ('a', 6)",
            "current window start at 6, reduce result ('a', 23)",
            "current window start at 15, reduce result ('a', 15)"
        ]
        self.assert_equals_sorted(expected, self.test_sink.get_results())
Example #7
0
    def test_window_aggregate_accumulator_type(self):
        """Aggregate keyed session windows with an explicitly declared accumulator type.

        Records are keyed by their first field and grouped into event-time
        session windows with a 2 ms gap. The aggregate function keeps a
        ``(sum, key)`` accumulator, declared to Flink as ``TUPLE(INT, STRING)``,
        and emits ``(key, sum)`` per window.
        """
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyAggregateFunction(AggregateFunction):
            """Sums the second field of the input; the accumulator is ``(sum, key)``."""

            def create_accumulator(self) -> Tuple[int, str]:
                return 0, ''

            def add(self, value: Tuple[str, int],
                    accumulator: Tuple[int, str]) -> Tuple[int, str]:
                return value[1] + accumulator[0], value[0]

            # The accumulator is (sum, key), matching create_accumulator/add/merge
            # and the accumulator_type passed to aggregate() below; the result
            # swaps the fields into (key, sum).
            def get_result(self, accumulator: Tuple[int, str]) -> Tuple[str, int]:
                return accumulator[1], accumulator[0]

            def merge(self, acc_a: Tuple[int, str],
                      acc_b: Tuple[int, str]) -> Tuple[int, str]:
                return acc_a[0] + acc_b[0], acc_a[1]

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .aggregate(MyAggregateFunction(),
                       accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                       output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_time_window_aggregate_accumulator_type')
        results = self.test_sink.get_results()
        expected = ['(a,15)', '(a,3)', '(a,6)', '(b,17)', '(b,3)']
        self.assert_equals_sorted(expected, results)
Example #8
0
    def test_global_window_with_purging_trigger(self):
        """Count elements per firing of a purging count trigger on a global window.

        Seven ``('hi', 1)`` records are keyed by their first field and placed
        into a global window whose trigger is ``CountTrigger.of(2)`` wrapped in
        a ``PurgingTrigger``. Three ``(hi,2)`` results are expected.
        """
        self.env.set_parallelism(1)
        records = [('hi', 1), ('hi', 1), ('hi', 1), ('hi', 1), ('hi', 1), ('hi', 1),
                   ('hi', 1)]
        data_stream = self.env.from_collection(
            records,
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream

        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyProcessFunction(ProcessWindowFunction):
            def clear(self, context: ProcessWindowFunction.Context) -> None:
                pass

            def process(
                    self, key, context: ProcessWindowFunction.Context,
                    elements: Iterable[Tuple[str, int]]) -> Iterable[tuple]:
                # Emit a single (key, element count) pair for this firing.
                element_count = len(list(elements))
                return [(key, element_count)]

        keyed = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING())
        windowed = keyed.window(GlobalWindows.create()) \
            .trigger(PurgingTrigger.of(CountTrigger.of(2)))
        counted = windowed.process(MyProcessFunction(),
                                   Types.TUPLE([Types.STRING(), Types.INT()]))
        counted.add_sink(self.test_sink)

        self.env.execute('test_global_window_with_purging_trigger')
        expected = ['(hi,2)', '(hi,2)', '(hi,2)']
        self.assert_equals_sorted(expected, self.test_sink.get_results())
Example #9
0
 def _build_parquet_columnar_job(self, row_type: RowType,
                                 parquet_file_name: str):
     """Connect a columnar Parquet bulk source to the test sink.

     :param row_type: row type passed to ``ParquetColumnarRowInputFormat``.
     :param parquet_file_name: path of the Parquet file to read.
     """
     input_format = ParquetColumnarRowInputFormat(
         Configuration(), row_type, 10, True, True)
     parquet_source = FileSource.for_bulk_file_format(
         input_format, parquet_file_name).build()
     stream = self.env.from_source(
         parquet_source, WatermarkStrategy.no_watermarks(), 'parquet-source')
     mapped = stream.map(PassThroughMapFunction())
     mapped.add_sink(self.test_sink)
Example #10
0
 def test_no_watermarks(self):
     """``no_watermarks()`` must create a Java ``NoWatermarksGenerator``."""
     jvm = get_gateway().jvm
     strategy = WatermarkStrategy.no_watermarks()
     j_generator = strategy._j_watermark_strategy.createWatermarkGenerator(None)
     expected_class = \
         jvm.org.apache.flink.api.common.eventtime.NoWatermarksGenerator
     self.assertTrue(is_instance_of(j_generator, expected_class))
Example #11
0
 def test_for_monotonous_timestamps(self):
     """``for_monotonous_timestamps()`` must create ``AscendingTimestampsWatermarks``."""
     jvm = get_gateway().jvm
     strategy = WatermarkStrategy.for_monotonous_timestamps()
     j_generator = strategy._j_watermark_strategy.createWatermarkGenerator(None)
     expected_class = \
         jvm.org.apache.flink.api.common.eventtime.AscendingTimestampsWatermarks
     self.assertTrue(is_instance_of(j_generator, expected_class))
Example #12
0
 def test_with_idleness(self):
     """``with_idleness`` must wrap the strategy and keep the timeout in milliseconds."""
     jvm = get_gateway().jvm
     idle_strategy = WatermarkStrategy.no_watermarks() \
         .with_idleness(Duration.of_seconds(5))
     j_watermark_strategy = idle_strategy._j_watermark_strategy

     expected_class = \
         jvm.org.apache.flink.api.common.eventtime.WatermarkStrategyWithIdleness
     self.assertTrue(is_instance_of(j_watermark_strategy, expected_class))

     # A 5 second timeout is expected to read back as 5000 ms.
     j_timeout = get_field_value(j_watermark_strategy, "idlenessTimeout")
     self.assertEqual(j_timeout.toMillis(), 5000)
Example #13
0
    def test_window_aggregate_passthrough(self):
        """Aggregate keyed session windows without declaring an accumulator type.

        Records are keyed by their first field and grouped into event-time
        session windows with a 2 ms gap. The accumulator is ``(key, counts)``
        where ``counts`` maps ``0``/``1`` to the number of even/odd second-field
        values; the result is ``(key, even_count - odd_count)``.
        """
        records = [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
                   ('a', 15)]
        data_stream = self.env.from_collection(
            records,
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyAggregateFunction(AggregateFunction):
            def create_accumulator(self) -> Tuple[str, Dict[int, int]]:
                return '', {0: 0, 1: 0}

            def add(self, value: Tuple[str, int],
                    accumulator: Tuple[str, Dict[int, int]]
                    ) -> Tuple[str, Dict[int, int]]:
                # The counts dict is updated in place and handed back.
                parity_counts = accumulator[1]
                parity_counts[value[1] % 2] += 1
                return value[0], parity_counts

            def get_result(self, accumulator: Tuple[str, Dict[int, int]]
                           ) -> Tuple[str, int]:
                parity_counts = accumulator[1]
                return accumulator[0], parity_counts[0] - parity_counts[1]

            def merge(self, acc_a: Tuple[str, Dict[int, int]],
                      acc_b: Tuple[str, Dict[int, int]]
                      ) -> Tuple[str, Dict[int, int]]:
                counts_a = acc_a[1]
                counts_b = acc_b[1]
                merged_counts = {
                    parity: counts_a[parity] + counts_b[parity]
                    for parity in (0, 1)
                }
                return acc_a[0], merged_counts

        windowed = data_stream \
            .assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
        aggregated = windowed.aggregate(
            MyAggregateFunction(),
            output_type=Types.TUPLE([Types.STRING(), Types.INT()]))
        aggregated.add_sink(self.test_sink)

        self.env.execute('test_time_window_aggregate_passthrough')
        expected = ['(a,-1)', '(a,0)', '(a,1)', '(b,-1)', '(b,0)']
        self.assert_equals_sorted(expected, self.test_sink.get_results())
Example #14
0
 def test_for_bounded_out_of_orderness(self):
     """``for_bounded_out_of_orderness`` must create the matching Java generator.

     The generator must be a ``BoundedOutOfOrdernessWatermarks`` whose
     ``outOfOrdernessMillis`` field equals 3000 for a 3 second bound.
     """
     jvm = get_gateway().jvm
     strategy = WatermarkStrategy.for_bounded_out_of_orderness(
         Duration.of_seconds(3))
     j_watermark_generator = \
         strategy._j_watermark_strategy.createWatermarkGenerator(None)

     expected_class = \
         jvm.org.apache.flink.api.common.eventtime.BoundedOutOfOrdernessWatermarks
     self.assertTrue(is_instance_of(j_watermark_generator, expected_class))

     out_of_orderness = get_field_value(j_watermark_generator,
                                        "outOfOrdernessMillis")
     self.assertEqual(out_of_orderness, 3000)
Example #15
0
    def test_compiling(self):
        """A Kafka source must appear as the first node of the execution plan."""
        source_builder = KafkaSource.builder()
        kafka_source = (
            source_builder
            .set_bootstrap_servers('localhost:9092')
            .set_topics('test_topic')
            .set_value_only_deserializer(SimpleStringSchema())
            .build()
        )

        stream = self.env.from_source(
            source=kafka_source,
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name='kafka source')
        stream.print()

        # The plan is returned as JSON text; only the first node's type is checked.
        execution_plan = json.loads(self.env.get_execution_plan())
        first_node_type = execution_plan['nodes'][0]['type']
        self.assertEqual('Source: kafka source', first_node_type)
Example #16
0
    def test_window_aggregate_process(self):
        """Aggregate keyed session windows, then format each result in a window function.

        Records are keyed by their first field and grouped into event-time
        session windows with a 2 ms gap. The aggregate function keeps a
        ``(sum, key)`` accumulator and emits ``(key, sum)``; the process window
        function turns that single aggregated element into a string.
        """
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyAggregateFunction(AggregateFunction):
            """Sums the second field of the input; the accumulator is ``(sum, key)``."""

            def create_accumulator(self) -> Tuple[int, str]:
                return 0, ''

            def add(self, value: Tuple[str, int],
                    accumulator: Tuple[int, str]) -> Tuple[int, str]:
                return value[1] + accumulator[0], value[0]

            # The accumulator is (sum, key), matching create_accumulator/add/merge
            # and the accumulator_type passed to aggregate() below; the result
            # swaps the fields into (key, sum).
            def get_result(self, accumulator: Tuple[int, str]) -> Tuple[str, int]:
                return accumulator[1], accumulator[0]

            def merge(self, acc_a: Tuple[int, str],
                      acc_b: Tuple[int, str]) -> Tuple[int, str]:
                return acc_a[0] + acc_b[0], acc_a[1]

        class MyProcessWindowFunction(ProcessWindowFunction):
            def process(self, key: str, context: ProcessWindowFunction.Context,
                        elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
                # Only the first element is read: the (key, sum) aggregate result.
                agg_result = next(iter(elements))
                yield "key {} timestamp sum {}".format(agg_result[0],
                                                       agg_result[1])

            def clear(self, context: ProcessWindowFunction.Context) -> None:
                pass

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .aggregate(MyAggregateFunction(),
                       window_function=MyProcessWindowFunction(),
                       accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                       output_type=Types.STRING()) \
            .add_sink(self.test_sink)

        # Name the job after this test rather than reusing another test's name.
        self.env.execute('test_window_aggregate_process')
        results = self.test_sink.get_results()
        expected = [
            'key a timestamp sum 15', 'key a timestamp sum 3',
            'key a timestamp sum 6', 'key b timestamp sum 17',
            'key b timestamp sum 3'
        ]
        self.assert_equals_sorted(expected, results)
Example #17
0
    def test_timestamp_assigner_and_watermark_strategy(self):
        """Check the key, timestamp and watermark seen by a keyed process function.

        Each row's second field is parsed as its timestamp. The process
        function reports the current key, element timestamp, current watermark
        and the row itself, and the sorted output is compared with the
        expected strings.
        """
        self.env.set_parallelism(1)
        self.env.get_config().set_auto_watermark_interval(2000)
        self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
        records = [(1, '1603708211000'), (2, '1603708224000'), (3, '1603708226000'),
                   (4, '1603708289000')]
        data_stream = self.env.from_collection(
            records, type_info=Types.ROW([Types.INT(), Types.STRING()]))

        class MyTimestampAssigner(TimestampAssigner):
            def extract_timestamp(self, value, record_timestamp) -> int:
                # The timestamp is stored as a string in the second field.
                return int(value[1])

        class MyProcessFunction(KeyedProcessFunction):
            def process_element(self, value, ctx, out):
                current_timestamp = ctx.timestamp()
                current_watermark = ctx.timer_service().current_watermark()
                current_key = ctx.get_current_key()
                message = (
                    "current key: {}, current timestamp: {}, current watermark: {}, "
                    "current_value: {}".format(str(current_key),
                                               str(current_timestamp),
                                               str(current_watermark),
                                               str(value)))
                out.collect(message)

            def on_timer(self, timestamp, ctx, out):
                pass

        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(MyTimestampAssigner())
        keyed = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type_info=Types.INT())
        processed = keyed.process(MyProcessFunction(), output_type=Types.STRING())
        processed.add_sink(self.test_sink)

        self.env.execute(
            'test time stamp assigner with keyed process function')
        result = self.test_sink.get_results()
        expected_result = [
            "current key: 1, current timestamp: 1603708211000, current watermark: "
            "9223372036854775807, current_value: <Row(1, '1603708211000')>",
            "current key: 2, current timestamp: 1603708224000, current watermark: "
            "9223372036854775807, current_value: <Row(2, '1603708224000')>",
            "current key: 3, current timestamp: 1603708226000, current watermark: "
            "9223372036854775807, current_value: <Row(3, '1603708226000')>",
            "current key: 4, current timestamp: 1603708289000, current watermark: "
            "9223372036854775807, current_value: <Row(4, '1603708289000')>"
        ]
        # Compare order-insensitively.
        self.assertEqual(sorted(expected_result), sorted(result))
0
    def test_session_window_late_merge(self):
        """Three records with a 5 ms session gap must end up in one window.

        The timestamps arrive as 0, 8, 4; a single ``(hi,3)`` result is expected.
        """
        records = [('hi', 0), ('hi', 8), ('hi', 4)]
        data_stream = self.env.from_collection(
            records,
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        keyed = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING())
        sessions = keyed.window(EventTimeSessionWindows.with_gap(Time.milliseconds(5)))
        counted = sessions.process(CountWindowProcessFunction(),
                                   Types.TUPLE([Types.STRING(), Types.INT()]))
        counted.add_sink(self.test_sink)

        self.env.execute('test_session_window_late_merge')
        self.assert_equals_sorted(['(hi,3)'], self.test_sink.get_results())
Example #19
0
 def test_with_watermark_alignment(self):
     """``with_watermark_alignment`` must pass group, drift and interval to Java.

     The 20 s maximum drift and 10 s update interval are expected to read back
     as 20000 and 10000.
     """
     jvm = get_gateway().jvm
     aligned_strategy = WatermarkStrategy.no_watermarks().with_watermark_alignment(
         "alignment-group-1", Duration.of_seconds(20), Duration.of_seconds(10))
     j_watermark_strategy = aligned_strategy._j_watermark_strategy

     expected_class = \
         jvm.org.apache.flink.api.common.eventtime.WatermarksWithWatermarkAlignment
     self.assertTrue(is_instance_of(j_watermark_strategy, expected_class))

     params = j_watermark_strategy.getAlignmentParameters()
     self.assertEqual(params.getWatermarkGroup(), "alignment-group-1")
     self.assertEqual(params.getMaxAllowedWatermarkDrift(), 20000)
     self.assertEqual(params.getUpdateInterval(), 10000)
Example #20
0
    def test_window_reduce_passthrough(self):
        """Reduce keyed session windows by summing the second field.

        Records are keyed by their first field and grouped into event-time
        session windows with a 2 ms gap; each window emits ``(key, sum)``.
        """
        records = [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)]
        data_stream = self.env.from_collection(
            records,
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        windowed = data_stream \
            .assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
        # Keep the key of the newer element and add up the second fields.
        reduced = windowed.reduce(
            lambda a, b: (b[0], a[1] + b[1]),
            output_type=Types.TUPLE([Types.STRING(), Types.INT()]))
        reduced.add_sink(self.test_sink)

        self.env.execute('test_time_window_reduce_passthrough')
        expected = ['(a,3)', '(a,6)', '(a,15)', '(b,3)', '(b,17)']
        self.assert_equals_sorted(expected, self.test_sink.get_results())
Example #21
0
    def test_event_time_tumbling_window(self):
        """Process keyed records in 5 ms tumbling event-time windows.

        The expected results are four-field tuples such as ``(hi,0,5,4)``,
        produced by ``CountWindowProcessFunction``.
        """
        records = [
            ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 5), ('hi', 8), ('hi', 9),
            ('hi', 15)]
        data_stream = self.env.from_collection(
            records,
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        keyed = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING())
        windowed = keyed.window(TumblingEventTimeWindows.of(Time.milliseconds(5)))
        counted = windowed.process(
            CountWindowProcessFunction(),
            Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()]))
        counted.add_sink(self.test_sink)

        self.env.execute('test_event_time_tumbling_window')
        expected = ['(hi,0,5,4)', '(hi,5,10,3)', '(hi,15,20,1)']
        self.assert_equals_sorted(expected, self.test_sink.get_results())
Example #22
0
    def test_event_time_dynamic_gap_session_window(self):
        """Process keyed records in session windows with an extractor-supplied gap.

        The gap comes from ``MySessionWindowTimeGapExtractor``; two results,
        ``(hi,3)`` and ``(hi,4)``, are expected.
        """
        self.env.set_parallelism(1)
        records = [
            ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 9), ('hi', 9), ('hi', 15)]
        data_stream = self.env.from_collection(
            records,
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        keyed = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING())
        session_assigner = EventTimeSessionWindows.with_dynamic_gap(
            MySessionWindowTimeGapExtractor())
        counted = keyed.window(session_assigner) \
            .process(CountWindowProcessFunction(),
                     Types.TUPLE([Types.STRING(), Types.INT()]))
        counted.add_sink(self.test_sink)

        self.env.execute('test_event_time_dynamic_gap_session_window')
        expected = ['(hi,3)', '(hi,4)']
        self.assert_equals_sorted(expected, self.test_sink.get_results())
Example #23
0
    def test_event_time_session_window_with_purging_trigger(self):
        """Process keyed session windows fired by a purging event-time trigger.

        Sessions use a 3 ms gap and the trigger is ``EventTimeTrigger`` wrapped
        in a ``PurgingTrigger``. The expected results are four-field tuples
        such as ``(hi,1,7,4)``.
        """
        records = [
            ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9), ('hi', 15)]
        data_stream = self.env.from_collection(
            records,
            type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        keyed = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING())
        sessions = keyed \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(3))) \
            .trigger(PurgingTrigger.of(EventTimeTrigger.create()))
        counted = sessions.process(
            CountWindowProcessFunction(),
            Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()]))
        counted.add_sink(self.test_sink)

        self.env.execute('test_event_time_session_window_with_purging_trigger')
        expected = ['(hi,1,7,4)', '(hi,8,12,2)', '(hi,15,18,1)']
        self.assert_equals_sorted(expected, self.test_sink.get_results())
Example #24
0
    def test_keyed_process_function_with_state(self):
        """Exercise value, list and map state inside a keyed process function.

        Rows are keyed by their second field. For every element the function
        reports the state contents as they were *before* the element was
        stored, then records the element's first field in each state.
        """
        self.env.set_parallelism(1)
        self.env.get_config().set_auto_watermark_interval(2000)
        self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
        records = [(1, 'hi', '1603708211000'), (2, 'hello', '1603708224000'),
                   (3, 'hi', '1603708226000'), (4, 'hello', '1603708289000'),
                   (5, 'hi', '1603708291000'), (6, 'hello', '1603708293000')]
        data_stream = self.env.from_collection(
            records,
            type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))

        class MyTimestampAssigner(TimestampAssigner):
            def extract_timestamp(self, value, record_timestamp) -> int:
                # The timestamp is stored as a string in the third field.
                return int(value[2])

        class MyProcessFunction(KeyedProcessFunction):
            def __init__(self):
                # State handles are created in open().
                self.value_state = None
                self.list_state = None
                self.map_state = None

            def open(self, runtime_context: RuntimeContext):
                self.value_state = runtime_context.get_state(
                    ValueStateDescriptor('value_state', Types.INT()))
                self.list_state = runtime_context.get_list_state(
                    ListStateDescriptor('list_state', Types.INT()))
                self.map_state = runtime_context.get_map_state(
                    MapStateDescriptor('map_state', Types.INT(), Types.STRING()))

            def process_element(self, value, ctx):
                # Read each state before writing to it so the previous content is reported.
                current_value = self.value_state.value()
                self.value_state.update(value[0])

                current_list = list(self.list_state.get())
                self.list_state.add(value[0])

                map_entries = [str(k) + ': ' + str(v)
                               for k, v in self.map_state.items()]
                map_entries_string = '{' + ', '.join(map_entries) + '}'
                self.map_state.put(value[0], value[1])

                current_key = ctx.get_current_key()
                yield "current key: {}, current value state: {}, current list state: {}, " \
                      "current map state: {}, current value: {}".format(str(current_key),
                                                                        str(current_value),
                                                                        str(current_list),
                                                                        map_entries_string,
                                                                        str(value))

            def on_timer(self, timestamp, ctx):
                pass

        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(MyTimestampAssigner())
        keyed = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[1], key_type_info=Types.STRING())
        processed = keyed.process(MyProcessFunction(), output_type=Types.STRING())
        processed.add_sink(self.test_sink)

        self.env.execute(
            'test time stamp assigner with keyed process function')
        result = self.test_sink.get_results()
        expected_result = [
            "current key: hi, current value state: None, current list state: [], "
            "current map state: {}, current value: Row(f0=1, f1='hi', "
            "f2='1603708211000')",
            "current key: hello, current value state: None, "
            "current list state: [], current map state: {}, current value: Row(f0=2,"
            " f1='hello', f2='1603708224000')",
            "current key: hi, current value state: 1, current list state: [1], "
            "current map state: {1: hi}, current value: Row(f0=3, f1='hi', "
            "f2='1603708226000')",
            "current key: hello, current value state: 2, current list state: [2], "
            "current map state: {2: hello}, current value: Row(f0=4, f1='hello', "
            "f2='1603708289000')",
            "current key: hi, current value state: 3, current list state: [1, 3], "
            "current map state: {1: hi, 3: hi}, current value: Row(f0=5, f1='hi', "
            "f2='1603708291000')",
            "current key: hello, current value state: 4, current list state: [2, 4],"
            " current map state: {2: hello, 4: hello}, current value: Row(f0=6, "
            "f1='hello', f2='1603708293000')"
        ]
        # Compare order-insensitively.
        self.assertEqual(sorted(expected_result), sorted(result))