def python_data_stream_example():
    """Run a Kafka-to-Kafka pipeline through a keyed process function.

    JSON rows are consumed from the ``timer-stream-source`` topic, stamped by
    ``KafkaRowTimestampAssigner`` under a 5 second bounded-out-of-orderness
    watermark strategy, keyed with ``MyKeySelector``, handled by
    ``MyProcessFunction`` and written as strings to ``timer-stream-sink``.
    The job is submitted with ``execute_async``.
    """
    env = StreamExecutionEnvironment.get_execution_environment()
    # A single parallel instance keeps fired timers and regular records on the
    # same worker, so the collected output is ordered and easy to assert on.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    row_type_info = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(), Types.INT()])
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(row_type_info) \
        .build()
    props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-source'}

    source = FlinkKafkaConsumer("timer-stream-source", deserialization_schema, props)
    sink = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(), props)

    bounded_strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5))
    strategy = bounded_strategy.with_timestamp_assigner(KafkaRowTimestampAssigner())

    source.set_start_from_earliest()
    stream = env.add_source(source).assign_timestamps_and_watermarks(strategy)
    keyed = stream.key_by(MyKeySelector(), key_type_info=Types.LONG())
    processed = keyed.process(MyProcessFunction(), output_type=Types.STRING())
    processed.add_sink(sink)

    env.execute_async("test data stream timer")
def test_side_output_late_data(self):
    """Check that late window elements are emitted through a side output.

    The stream uses 5 ms tumbling event-time windows, zero allowed lateness
    and a watermark strategy built from ``PerElementWatermarkGenerator``.
    ``('a', 4)`` follows ``('a', 8)`` in the input and is expected on the
    ``late-data`` side output instead of in the main window results.
    """
    self.env.set_parallelism(1)
    j_env_conf = get_j_env_configuration(self.env._j_stream_execution_environment)
    config = Configuration(j_configuration=j_env_conf)
    config.set_integer('python.fn-execution.bundle.size', 1)

    jvm = get_gateway().jvm
    j_supplier = jvm.org.apache.flink.streaming.api.functions.python.eventtime \
        .PerElementWatermarkGenerator.getSupplier()
    j_strategy = jvm.org.apache.flink.api.common.eventtime.WatermarkStrategy \
        .forGenerator(j_supplier)
    strategy = WatermarkStrategy(j_strategy) \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    late_tag = OutputTag('late-data', type_info=Types.ROW([Types.STRING(), Types.INT()]))
    source = self.env.from_collection(
        [('a', 0), ('a', 8), ('a', 4), ('a', 6)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))

    windowed = (
        source.assign_timestamps_and_watermarks(strategy)
        .key_by(lambda e: e[0])
        .window(TumblingEventTimeWindows.of(Time.milliseconds(5)))
        .allowed_lateness(0)
        .side_output_late_data(late_tag)
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()]))
    )

    main_sink = DataStreamTestSinkFunction()
    windowed.add_sink(main_sink)
    side_sink = DataStreamTestSinkFunction()
    windowed.get_side_output(late_tag).add_sink(side_sink)

    self.env.execute('test_side_output_late_data')

    self.assert_equals_sorted(['(a,0,5,1)', '(a,5,10,2)'], main_sink.get_results())
    self.assert_equals_sorted(['+I[a, 4]'], side_sink.get_results())
def _build_csv_job(self, schema):
    """Assemble a job that reads ``self.csv_file_name`` as CSV into ``self.test_sink``.

    :param schema: Schema handed to ``CsvReaderFormat.for_schema``.
    """
    csv_format = CsvReaderFormat.for_schema(schema)
    csv_source = FileSource.for_record_stream_format(csv_format, self.csv_file_name).build()
    stream = self.env.from_source(csv_source, WatermarkStrategy.no_watermarks(), 'csv-source')
    mapped = stream.map(PassThroughMapFunction(), output_type=Types.PICKLED_BYTE_ARRAY())
    mapped.add_sink(self.test_sink)
def _build_parquet_avro_job(self, record_schema, parquet_file_name):
    """Assemble a job that reads Avro generic records from a Parquet file.

    :param record_schema: Schema handed to ``AvroParquetReaders.for_generic_record``.
    :param parquet_file_name: Path of the Parquet file to read.
    """
    reader_format = AvroParquetReaders.for_generic_record(record_schema)
    parquet_source = FileSource.for_record_stream_format(
        reader_format, parquet_file_name).build()
    stream = self.env.from_source(
        parquet_source,
        WatermarkStrategy.for_monotonous_timestamps(),
        "parquet-source")
    mapped = stream.map(PassThroughMapFunction())
    mapped.add_sink(self.test_sink)
def _build_parquet_columnar_job(self, row_type: RowType):
    """Assemble a job that bulk-reads ``self.parquet_file_name`` into ``self.test_sink``.

    :param row_type: Row type handed to ``ParquetColumnarRowInputFormat``.
    """
    input_format = ParquetColumnarRowInputFormat(row_type, Configuration(), 10, True, False)
    parquet_source = FileSource.for_bulk_file_format(
        input_format, self.parquet_file_name).build()
    stream = self.env.from_source(
        parquet_source, WatermarkStrategy.no_watermarks(), 'parquet-source')
    mapped = stream.map(lambda e: e)
    mapped.add_sink(self.test_sink)
def test_window_all_reduce_process(self):
    """Reduce non-keyed session windows and format each result with a window function."""
    self.env.set_parallelism(1)

    class MyProcessFunction(ProcessAllWindowFunction):
        """Emit the window start together with the first element of the window."""

        def process(self,
                    context: 'ProcessAllWindowFunction.Context',
                    elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
            window_start = context.window().start
            first_element = next(iter(elements))
            yield "current window start at {}, reduce result {}".format(
                window_start, first_element)

    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    (
        data_stream.assign_timestamps_and_watermarks(strategy)
        .window_all(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
        .reduce(lambda a, b: (a[0], a[1] + b[1]),
                window_function=MyProcessFunction(),
                output_type=Types.STRING())
        .add_sink(self.test_sink)
    )

    self.env.execute('test_window_all_reduce_process')

    expected = [
        "current window start at 1, reduce result ('a', 6)",
        "current window start at 6, reduce result ('a', 23)",
        "current window start at 15, reduce result ('a', 15)"
    ]
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_window_aggregate_accumulator_type(self):
    """Aggregate keyed session windows with an explicitly typed accumulator.

    The accumulator is an ``(int, str)`` pair (running sum of the second
    column, last seen key) declared via ``accumulator_type``; the result is
    emitted as ``(key, sum)``.
    """
    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    class MyAggregateFunction(AggregateFunction):
        """Sum the second column; the accumulator layout is ``(sum, key)``."""

        def create_accumulator(self) -> Tuple[int, str]:
            return 0, ''

        def add(self, value: Tuple[str, int],
                accumulator: Tuple[int, str]) -> Tuple[int, str]:
            return value[1] + accumulator[0], value[0]

        # The accumulator is (sum, key); the result swaps it into (key, sum).
        def get_result(self, accumulator: Tuple[int, str]) -> Tuple[str, int]:
            return accumulator[1], accumulator[0]

        def merge(self, acc_a: Tuple[int, str],
                  acc_b: Tuple[int, str]) -> Tuple[int, str]:
            return acc_a[0] + acc_b[0], acc_a[1]

    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
        .aggregate(MyAggregateFunction(),
                   accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                   output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)

    self.env.execute('test_time_window_aggregate_accumulator_type')
    results = self.test_sink.get_results()
    expected = ['(a,15)', '(a,3)', '(a,6)', '(b,17)', '(b,3)']
    self.assert_equals_sorted(expected, results)
def test_global_window_with_purging_trigger(self):
    """Fire a global window every two elements via a purging count trigger.

    Seven identical records are fed in; three ``(hi,2)`` results are expected.
    """
    self.env.set_parallelism(1)

    class MyProcessFunction(ProcessWindowFunction):
        """Emit ``(key, number of elements in the fired window)``."""

        def process(self,
                    key,
                    context: ProcessWindowFunction.Context,
                    elements: Iterable[Tuple[str, int]]) -> Iterable[tuple]:
            element_count = len(list(elements))
            return [(key, element_count)]

        def clear(self, context: ProcessWindowFunction.Context) -> None:
            pass

    data_stream = self.env.from_collection(
        [('hi', 1), ('hi', 1), ('hi', 1), ('hi', 1), ('hi', 1), ('hi', 1), ('hi', 1)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    (
        data_stream.assign_timestamps_and_watermarks(strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(GlobalWindows.create())
        .trigger(PurgingTrigger.of(CountTrigger.of(2)))
        .process(MyProcessFunction(), Types.TUPLE([Types.STRING(), Types.INT()]))
        .add_sink(self.test_sink)
    )

    self.env.execute('test_global_window_with_purging_trigger')

    expected = ['(hi,2)', '(hi,2)', '(hi,2)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def _build_parquet_columnar_job(self, row_type: RowType, parquet_file_name: str):
    """Assemble a job that bulk-reads a Parquet file into ``self.test_sink``.

    :param row_type: Row type handed to ``ParquetColumnarRowInputFormat``.
    :param parquet_file_name: Path of the Parquet file to read.
    """
    input_format = ParquetColumnarRowInputFormat(Configuration(), row_type, 10, True, True)
    parquet_source = FileSource.for_bulk_file_format(input_format, parquet_file_name).build()
    stream = self.env.from_source(
        parquet_source, WatermarkStrategy.no_watermarks(), 'parquet-source')
    mapped = stream.map(PassThroughMapFunction())
    mapped.add_sink(self.test_sink)
def test_no_watermarks(self):
    """``no_watermarks`` must create a Java ``NoWatermarksGenerator``."""
    jvm = get_gateway().jvm
    j_strategy = WatermarkStrategy.no_watermarks()._j_watermark_strategy
    j_generator = j_strategy.createWatermarkGenerator(None)
    expected_class = jvm.org.apache.flink.api.common.eventtime.NoWatermarksGenerator
    self.assertTrue(is_instance_of(j_generator, expected_class))
def test_for_monotonous_timestamps(self):
    """``for_monotonous_timestamps`` must create ``AscendingTimestampsWatermarks``."""
    jvm = get_gateway().jvm
    j_strategy = WatermarkStrategy.for_monotonous_timestamps()._j_watermark_strategy
    j_generator = j_strategy.createWatermarkGenerator(None)
    expected_class = jvm.org.apache.flink.api.common.eventtime.AscendingTimestampsWatermarks
    self.assertTrue(is_instance_of(j_generator, expected_class))
def test_with_idleness(self):
    """``with_idleness`` must wrap the strategy and carry the 5 second timeout."""
    jvm = get_gateway().jvm
    idle_strategy = WatermarkStrategy.no_watermarks().with_idleness(Duration.of_seconds(5))
    j_strategy = idle_strategy._j_watermark_strategy

    expected_class = jvm.org.apache.flink.api.common.eventtime.WatermarkStrategyWithIdleness
    self.assertTrue(is_instance_of(j_strategy, expected_class))

    idleness_timeout = get_field_value(j_strategy, "idlenessTimeout")
    self.assertEqual(idleness_timeout.toMillis(), 5000)
def test_window_aggregate_passthrough(self):
    """Aggregate keyed session windows without declaring an accumulator type.

    The accumulator pairs the last seen key with a dict counting even (key 0)
    and odd (key 1) values; the result is ``(key, evens - odds)``.
    """

    class MyAggregateFunction(AggregateFunction):
        """Count even and odd second-column values per window."""

        def create_accumulator(self) -> Tuple[str, Dict[int, int]]:
            return '', {0: 0, 1: 0}

        def add(self,
                value: Tuple[str, int],
                accumulator: Tuple[str, Dict[int, int]]) -> Tuple[str, Dict[int, int]]:
            parity_counts = accumulator[1]
            parity_counts[value[1] % 2] += 1
            return value[0], parity_counts

        def get_result(self,
                       accumulator: Tuple[str, Dict[int, int]]) -> Tuple[str, int]:
            parity_counts = accumulator[1]
            return accumulator[0], parity_counts[0] - parity_counts[1]

        def merge(self,
                  acc_a: Tuple[str, Dict[int, int]],
                  acc_b: Tuple[str, Dict[int, int]]) -> Tuple[str, Dict[int, int]]:
            counts_a = acc_a[1]
            counts_b = acc_b[1]
            merged_counts = {parity: counts_a[parity] + counts_b[parity] for parity in (0, 1)}
            return acc_a[0], merged_counts

    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    (
        data_stream.assign_timestamps_and_watermarks(strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
        .aggregate(MyAggregateFunction(),
                   output_type=Types.TUPLE([Types.STRING(), Types.INT()]))
        .add_sink(self.test_sink)
    )

    self.env.execute('test_time_window_aggregate_passthrough')

    expected = ['(a,-1)', '(a,0)', '(a,1)', '(b,-1)', '(b,0)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_for_bounded_out_of_orderness(self):
    """``for_bounded_out_of_orderness`` must create a generator with a 3000 ms bound."""
    jvm = get_gateway().jvm
    strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(3))
    j_generator = strategy._j_watermark_strategy.createWatermarkGenerator(None)

    expected_class = jvm.org.apache.flink.api.common.eventtime.BoundedOutOfOrdernessWatermarks
    self.assertTrue(is_instance_of(j_generator, expected_class))

    out_of_orderness = get_field_value(j_generator, "outOfOrdernessMillis")
    self.assertEqual(out_of_orderness, 3000)
def test_compiling(self):
    """A Kafka source must appear as the first node of the execution plan."""
    kafka_source = (
        KafkaSource.builder()
        .set_bootstrap_servers('localhost:9092')
        .set_topics('test_topic')
        .set_value_only_deserializer(SimpleStringSchema())
        .build()
    )

    stream = self.env.from_source(
        source=kafka_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='kafka source')
    stream.print()

    execution_plan = json.loads(self.env.get_execution_plan())
    first_node = execution_plan['nodes'][0]
    self.assertEqual('Source: kafka source', first_node['type'])
def test_window_aggregate_process(self):
    """Aggregate keyed session windows, then format each result with a window function.

    ``MyAggregateFunction`` keeps an ``(int, str)`` accumulator (running sum
    of the second column, last seen key) and produces ``(key, sum)``;
    ``MyProcessWindowFunction`` renders that single aggregate as a string.
    """
    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    class MyAggregateFunction(AggregateFunction):
        """Sum the second column; the accumulator layout is ``(sum, key)``."""

        def create_accumulator(self) -> Tuple[int, str]:
            return 0, ''

        def add(self, value: Tuple[str, int],
                accumulator: Tuple[int, str]) -> Tuple[int, str]:
            return value[1] + accumulator[0], value[0]

        # The accumulator is (sum, key); the result swaps it into (key, sum).
        def get_result(self, accumulator: Tuple[int, str]) -> Tuple[str, int]:
            return accumulator[1], accumulator[0]

        def merge(self, acc_a: Tuple[int, str],
                  acc_b: Tuple[int, str]) -> Tuple[int, str]:
            return acc_a[0] + acc_b[0], acc_a[1]

    class MyProcessWindowFunction(ProcessWindowFunction):
        """Render the first element of ``elements`` as a string."""

        def process(self, key: str, context: ProcessWindowFunction.Context,
                    elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
            agg_result = next(iter(elements))
            yield "key {} timestamp sum {}".format(agg_result[0], agg_result[1])

        def clear(self, context: ProcessWindowFunction.Context) -> None:
            pass

    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
        .aggregate(MyAggregateFunction(),
                   window_function=MyProcessWindowFunction(),
                   accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                   output_type=Types.STRING()) \
        .add_sink(self.test_sink)

    # The job name previously duplicated the accumulator-type test's name.
    self.env.execute('test_window_aggregate_process')
    results = self.test_sink.get_results()
    expected = [
        'key a timestamp sum 15',
        'key a timestamp sum 3',
        'key a timestamp sum 6',
        'key b timestamp sum 17',
        'key b timestamp sum 3'
    ]
    self.assert_equals_sorted(expected, results)
def test_timestamp_assigner_and_watermark_strategy(self):
    """Check the timestamp, watermark and key seen by a keyed process function.

    Timestamps are parsed from the second (string) column of each row; the
    function reports the context's timestamp, current watermark and key.
    """
    self.env.set_parallelism(1)
    self.env.get_config().set_auto_watermark_interval(2000)
    self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    class MyTimestampAssigner(TimestampAssigner):
        """Use the second column, parsed as an integer, as the event timestamp."""

        def extract_timestamp(self, value, record_timestamp) -> int:
            return int(value[1])

    class MyProcessFunction(KeyedProcessFunction):
        """Collect one descriptive string per processed element."""

        def process_element(self, value, ctx, out):
            current_timestamp = ctx.timestamp()
            current_watermark = ctx.timer_service().current_watermark()
            current_key = ctx.get_current_key()
            message = (
                "current key: {}, current timestamp: {}, current watermark: {}, "
                "current_value: {}"
            ).format(str(current_key), str(current_timestamp),
                     str(current_watermark), str(value))
            out.collect(message)

        def on_timer(self, timestamp, ctx, out):
            pass

    data_stream = self.env.from_collection(
        [(1, '1603708211000'), (2, '1603708224000'),
         (3, '1603708226000'), (4, '1603708289000')],
        type_info=Types.ROW([Types.INT(), Types.STRING()]))

    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(MyTimestampAssigner())

    (
        data_stream.assign_timestamps_and_watermarks(strategy)
        .key_by(lambda x: x[0], key_type_info=Types.INT())
        .process(MyProcessFunction(), output_type=Types.STRING())
        .add_sink(self.test_sink)
    )

    self.env.execute('test time stamp assigner with keyed process function')

    actual = self.test_sink.get_results()
    expected = [
        "current key: 1, current timestamp: 1603708211000, current watermark: "
        "9223372036854775807, current_value: <Row(1, '1603708211000')>",
        "current key: 2, current timestamp: 1603708224000, current watermark: "
        "9223372036854775807, current_value: <Row(2, '1603708224000')>",
        "current key: 3, current timestamp: 1603708226000, current watermark: "
        "9223372036854775807, current_value: <Row(3, '1603708226000')>",
        "current key: 4, current timestamp: 1603708289000, current watermark: "
        "9223372036854775807, current_value: <Row(4, '1603708289000')>"
    ]
    # Order of emission is not asserted; compare the sorted contents.
    actual.sort()
    expected.sort()
    self.assertEqual(expected, actual)
def test_session_window_late_merge(self):
    """Out-of-order element ``('hi', 4)`` must merge all three records into one session."""
    data_stream = self.env.from_collection(
        [('hi', 0), ('hi', 8), ('hi', 4)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    (
        data_stream.assign_timestamps_and_watermarks(strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(5)))
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.INT()]))
        .add_sink(self.test_sink)
    )

    self.env.execute('test_session_window_late_merge')

    self.assert_equals_sorted(['(hi,3)'], self.test_sink.get_results())
def test_with_watermark_alignment(self):
    """``with_watermark_alignment`` must expose the group, drift and update interval."""
    jvm = get_gateway().jvm
    aligned_strategy = WatermarkStrategy.no_watermarks().with_watermark_alignment(
        "alignment-group-1", Duration.of_seconds(20), Duration.of_seconds(10))
    j_strategy = aligned_strategy._j_watermark_strategy

    expected_class = \
        jvm.org.apache.flink.api.common.eventtime.WatermarksWithWatermarkAlignment
    self.assertTrue(is_instance_of(j_strategy, expected_class))

    params = j_strategy.getAlignmentParameters()
    self.assertEqual(params.getWatermarkGroup(), "alignment-group-1")
    self.assertEqual(params.getMaxAllowedWatermarkDrift(), 20000)
    self.assertEqual(params.getUpdateInterval(), 10000)
def test_window_reduce_passthrough(self):
    """Reduce keyed session windows by summing the second column."""
    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    (
        data_stream.assign_timestamps_and_watermarks(strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
        .reduce(lambda a, b: (b[0], a[1] + b[1]),
                output_type=Types.TUPLE([Types.STRING(), Types.INT()]))
        .add_sink(self.test_sink)
    )

    self.env.execute('test_time_window_reduce_passthrough')

    expected = ['(a,3)', '(a,6)', '(a,15)', '(b,3)', '(b,17)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_event_time_tumbling_window(self):
    """Count elements per key in 5 ms tumbling event-time windows."""
    data_stream = self.env.from_collection(
        [('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4),
         ('hi', 5), ('hi', 8), ('hi', 9), ('hi', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    (
        data_stream.assign_timestamps_and_watermarks(strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(TumblingEventTimeWindows.of(Time.milliseconds(5)))
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()]))
        .add_sink(self.test_sink)
    )

    self.env.execute('test_event_time_tumbling_window')

    expected = ['(hi,0,5,4)', '(hi,5,10,3)', '(hi,15,20,1)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_event_time_dynamic_gap_session_window(self):
    """Count elements in session windows whose gap comes from an extractor."""
    self.env.set_parallelism(1)
    data_stream = self.env.from_collection(
        [('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4),
         ('hi', 9), ('hi', 9), ('hi', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    session_windows = EventTimeSessionWindows.with_dynamic_gap(
        MySessionWindowTimeGapExtractor())
    (
        data_stream.assign_timestamps_and_watermarks(strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(session_windows)
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.INT()]))
        .add_sink(self.test_sink)
    )

    self.env.execute('test_event_time_dynamic_gap_session_window')

    expected = ['(hi,3)', '(hi,4)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_event_time_session_window_with_purging_trigger(self):
    """Count elements in 3 ms-gap session windows fired by a purging event-time trigger."""
    data_stream = self.env.from_collection(
        [('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4),
         ('hi', 8), ('hi', 9), ('hi', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    (
        data_stream.assign_timestamps_and_watermarks(strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(3)))
        .trigger(PurgingTrigger.of(EventTimeTrigger.create()))
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()]))
        .add_sink(self.test_sink)
    )

    self.env.execute('test_event_time_session_window_with_purging_trigger')

    expected = ['(hi,1,7,4)', '(hi,8,12,2)', '(hi,15,18,1)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_keyed_process_function_with_state(self):
    """Exercise value, list and map state inside a keyed process function.

    For each element the function reports the state contents as they were
    before the element was applied, then records the element's first column
    in all three states.
    """
    self.env.set_parallelism(1)
    self.env.get_config().set_auto_watermark_interval(2000)
    self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    class MyTimestampAssigner(TimestampAssigner):
        """Use the third column, parsed as an integer, as the event timestamp."""

        def extract_timestamp(self, value, record_timestamp) -> int:
            return int(value[2])

    class MyProcessFunction(KeyedProcessFunction):
        """Track the first column of every element in value, list and map state."""

        def __init__(self):
            self.value_state = None
            self.list_state = None
            self.map_state = None

        def open(self, runtime_context: RuntimeContext):
            self.value_state = runtime_context.get_state(
                ValueStateDescriptor('value_state', Types.INT()))
            self.list_state = runtime_context.get_list_state(
                ListStateDescriptor('list_state', Types.INT()))
            self.map_state = runtime_context.get_map_state(
                MapStateDescriptor('map_state', Types.INT(), Types.STRING()))

        def process_element(self, value, ctx):
            # Snapshot each state before updating it with the current element.
            current_value = self.value_state.value()
            self.value_state.update(value[0])

            current_list = list(self.list_state.get())
            self.list_state.add(value[0])

            rendered_entries = [str(k) + ': ' + str(v) for k, v in self.map_state.items()]
            map_entries_string = '{' + ', '.join(rendered_entries) + '}'
            self.map_state.put(value[0], value[1])

            current_key = ctx.get_current_key()
            template = ("current key: {}, current value state: {}, current list state: {}, "
                        "current map state: {}, current value: {}")
            yield template.format(str(current_key),
                                  str(current_value),
                                  str(current_list),
                                  map_entries_string,
                                  str(value))

        def on_timer(self, timestamp, ctx):
            pass

    data_stream = self.env.from_collection(
        [(1, 'hi', '1603708211000'), (2, 'hello', '1603708224000'),
         (3, 'hi', '1603708226000'), (4, 'hello', '1603708289000'),
         (5, 'hi', '1603708291000'), (6, 'hello', '1603708293000')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))

    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(MyTimestampAssigner())

    (
        data_stream.assign_timestamps_and_watermarks(strategy)
        .key_by(lambda x: x[1], key_type_info=Types.STRING())
        .process(MyProcessFunction(), output_type=Types.STRING())
        .add_sink(self.test_sink)
    )

    self.env.execute('test time stamp assigner with keyed process function')

    actual = self.test_sink.get_results()
    expected = [
        "current key: hi, current value state: None, current list state: [], "
        "current map state: {}, current value: Row(f0=1, f1='hi', "
        "f2='1603708211000')",
        "current key: hello, current value state: None, "
        "current list state: [], current map state: {}, current value: Row(f0=2,"
        " f1='hello', f2='1603708224000')",
        "current key: hi, current value state: 1, current list state: [1], "
        "current map state: {1: hi}, current value: Row(f0=3, f1='hi', "
        "f2='1603708226000')",
        "current key: hello, current value state: 2, current list state: [2], "
        "current map state: {2: hello}, current value: Row(f0=4, f1='hello', "
        "f2='1603708289000')",
        "current key: hi, current value state: 3, current list state: [1, 3], "
        "current map state: {1: hi, 3: hi}, current value: Row(f0=5, f1='hi', "
        "f2='1603708291000')",
        "current key: hello, current value state: 4, current list state: [2, 4],"
        " current map state: {2: hello, 4: hello}, current value: Row(f0=6, "
        "f1='hello', f2='1603708293000')"
    ]
    # Order of emission is not asserted; compare the sorted contents.
    actual.sort()
    expected.sort()
    self.assertEqual(expected, actual)