def test_window_aggregate_accumulator_type(self):
    """Aggregate event-time session windows with an explicit accumulator type.

    The stream is keyed by the first tuple field and grouped into session
    windows with a 2 ms gap. The aggregate function keeps a ``(sum, key)``
    accumulator, whose type is passed through ``accumulator_type``, and emits
    ``(key, sum)`` for every window.
    """
    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    class MyAggregateFunction(AggregateFunction):
        """Sums the second field; the accumulator layout is ``(sum, key)``."""

        def create_accumulator(self) -> Tuple[int, str]:
            return 0, ''

        def add(self, value: Tuple[str, int],
                accumulator: Tuple[int, str]) -> Tuple[int, str]:
            return value[1] + accumulator[0], value[0]

        def get_result(self, accumulator: Tuple[int, str]) -> Tuple[str, int]:
            # Swap the (sum, key) accumulator into (key, sum) output order.
            return accumulator[1], accumulator[0]

        def merge(self, acc_a: Tuple[int, str],
                  acc_b: Tuple[int, str]) -> Tuple[int, str]:
            return acc_a[0] + acc_b[0], acc_a[1]

    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
        .aggregate(MyAggregateFunction(),
                   accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                   output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)

    self.env.execute('test_time_window_aggregate_accumulator_type')
    results = self.test_sink.get_results()
    expected = ['(a,15)', '(a,3)', '(a,6)', '(b,17)', '(b,3)']
    self.assert_equals_sorted(expected, results)
def test_reducing_state(self):
    """Check that a keyed ReducingState accumulates a running sum per key.

    Each element adds its first field to the reducing state of its key (the
    second field) and emits the current state value together with the key.
    """
    self.env.set_parallelism(2)
    records = [(1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'), (5, 'hi'), (6, 'hello')]
    data_stream = self.env.from_collection(
        records, type_info=Types.TUPLE([Types.INT(), Types.STRING()]))

    class MyProcessFunction(KeyedProcessFunction):
        """Emits the running sum kept in a ReducingState for every element."""

        def __init__(self):
            self.reducing_state = None  # type: ReducingState

        def open(self, runtime_context: RuntimeContext):
            descriptor = ReducingStateDescriptor(
                'reducing_state', lambda i, i2: i + i2, Types.INT())
            self.reducing_state = runtime_context.get_reducing_state(descriptor)

        def process_element(self, value, ctx):
            self.reducing_state.add(value[0])
            yield Row(self.reducing_state.get(), value[1])

    keyed_stream = data_stream.key_by(lambda x: x[1], key_type_info=Types.STRING())
    summed_stream = keyed_stream.process(
        MyProcessFunction(),
        output_type=Types.TUPLE([Types.INT(), Types.STRING()]))
    summed_stream.add_sink(self.test_sink)
    self.env.execute('test_reducing_state')

    # Output order is not compared; both sides are sorted first.
    actual = sorted(self.test_sink.get_results())
    expected = sorted([
        '(1,hi)', '(2,hello)', '(4,hi)', '(6,hello)', '(9,hi)', '(12,hello)'
    ])
    self.assertEqual(expected, actual)
def test_global_window_with_purging_trigger(self):
    """Count elements of a global window fired by a purging count trigger.

    Seven identical elements are sent through a global window whose trigger
    is ``PurgingTrigger.of(CountTrigger.of(2))``; the test expects three
    emissions of two elements each.
    """
    self.env.set_parallelism(1)
    data_stream = self.env.from_collection(
        [('hi', 1)] * 7,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    class MyProcessFunction(ProcessWindowFunction):
        """Emits ``(key, number_of_elements)`` for each window firing."""

        def clear(self, context: ProcessWindowFunction.Context) -> None:
            pass

        def process(self,
                    key,
                    context: ProcessWindowFunction.Context,
                    elements: Iterable[Tuple[str, int]]) -> Iterable[tuple]:
            element_count = len(list(elements))
            return [(key, element_count)]

    windowed = (
        data_stream.assign_timestamps_and_watermarks(watermark_strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(GlobalWindows.create())
        .trigger(PurgingTrigger.of(CountTrigger.of(2)))
    )
    windowed.process(
        MyProcessFunction(),
        Types.TUPLE([Types.STRING(), Types.INT()])).add_sink(self.test_sink)

    self.env.execute('test_global_window_with_purging_trigger')
    expected = ['(hi,2)', '(hi,2)', '(hi,2)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_window_aggregate_passthrough(self):
    """Aggregate session windows without declaring an accumulator type.

    The accumulator is ``(key, {0: even_count, 1: odd_count})``; the result
    per window is ``(key, even_count - odd_count)``. No ``accumulator_type``
    is passed to ``aggregate``.
    """
    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    class MyAggregateFunction(AggregateFunction):
        """Tracks how many even and odd second-field values were seen."""

        def create_accumulator(self) -> Tuple[str, Dict[int, int]]:
            return '', {0: 0, 1: 0}

        def add(self,
                value: Tuple[str, int],
                accumulator: Tuple[str, Dict[int, int]]) -> Tuple[str, Dict[int, int]]:
            parity_counts = accumulator[1]
            # Updates the accumulator's dict in place, keyed by parity.
            parity_counts[value[1] % 2] += 1
            return value[0], parity_counts

        def get_result(self,
                       accumulator: Tuple[str, Dict[int, int]]) -> Tuple[str, int]:
            parity_counts = accumulator[1]
            return accumulator[0], parity_counts[0] - parity_counts[1]

        def merge(self,
                  acc_a: Tuple[str, Dict[int, int]],
                  acc_b: Tuple[str, Dict[int, int]]) -> Tuple[str, Dict[int, int]]:
            counts_a, counts_b = acc_a[1], acc_b[1]
            merged_counts = {
                parity: counts_a[parity] + counts_b[parity] for parity in (0, 1)
            }
            return acc_a[0], merged_counts

    session_windows = (
        data_stream.assign_timestamps_and_watermarks(watermark_strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
    )
    session_windows.aggregate(
        MyAggregateFunction(),
        output_type=Types.TUPLE([Types.STRING(), Types.INT()])).add_sink(self.test_sink)

    self.env.execute('test_time_window_aggregate_passthrough')
    expected = ['(a,-1)', '(a,0)', '(a,1)', '(b,-1)', '(b,0)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_count_sliding_window(self):
    """Apply SumWindowFunction over ``CountSlidingWindowAssigner(2, 1)`` windows.

    The stream is keyed by the second tuple field before windowing.
    """
    records = [
        (1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'), (5, 'hi'), (6, 'hello')]
    data_stream = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.INT(), Types.STRING()]))  # type: DataStream

    keyed_stream = data_stream.key_by(lambda x: x[1], key_type=Types.STRING())
    summed_stream = keyed_stream \
        .window(CountSlidingWindowAssigner(2, 1)) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()]))
    summed_stream.add_sink(self.test_sink)

    self.env.execute('test_count_sliding_window')
    expected = ['(hello,6)', '(hi,8)', '(hi,4)', '(hello,10)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_window_aggregate_process(self):
    """Combine an AggregateFunction with a ProcessWindowFunction.

    The aggregate sums the second field per session window (2 ms gap) using a
    ``(sum, key)`` accumulator and yields ``(key, sum)``. The window function
    then formats that single aggregated element into a string.
    """
    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    class MyAggregateFunction(AggregateFunction):
        """Sums the second field; the accumulator layout is ``(sum, key)``."""

        def create_accumulator(self) -> Tuple[int, str]:
            return 0, ''

        def add(self, value: Tuple[str, int],
                accumulator: Tuple[int, str]) -> Tuple[int, str]:
            return value[1] + accumulator[0], value[0]

        def get_result(self, accumulator: Tuple[int, str]) -> Tuple[str, int]:
            # Swap the (sum, key) accumulator into (key, sum) output order.
            return accumulator[1], accumulator[0]

        def merge(self, acc_a: Tuple[int, str],
                  acc_b: Tuple[int, str]) -> Tuple[int, str]:
            return acc_a[0] + acc_b[0], acc_a[1]

    class MyProcessWindowFunction(ProcessWindowFunction):
        """Formats the aggregated ``(key, sum)`` element of a window."""

        def process(self,
                    key: str,
                    context: ProcessWindowFunction.Context,
                    elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
            # Only the first element of ``elements`` is read.
            agg_result = next(iter(elements))
            yield "key {} timestamp sum {}".format(agg_result[0], agg_result[1])

        def clear(self, context: ProcessWindowFunction.Context) -> None:
            pass

    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
        .aggregate(MyAggregateFunction(),
                   window_function=MyProcessWindowFunction(),
                   accumulator_type=Types.TUPLE([Types.INT(), Types.STRING()]),
                   output_type=Types.STRING()) \
        .add_sink(self.test_sink)

    # Job name matches this test rather than the accumulator-type test.
    self.env.execute('test_window_aggregate_process')
    results = self.test_sink.get_results()
    expected = [
        'key a timestamp sum 15',
        'key a timestamp sum 3',
        'key a timestamp sum 6',
        'key b timestamp sum 17',
        'key b timestamp sum 3'
    ]
    self.assert_equals_sorted(expected, results)
def test_tuple_type(self):
    """Check equality, string form and field types of ``TupleTypeInfo``."""
    # Equal field types give equal type infos. The third positional argument
    # of assertEqual is the failure message, so no stray ``True`` is passed.
    self.assertEqual(TupleTypeInfo([Types.STRING(), Types.INT()]),
                     TupleTypeInfo([Types.STRING(), Types.INT()]))

    self.assertEqual(str(TupleTypeInfo([Types.STRING(), Types.INT()])),
                     "TupleTypeInfo(String, Integer)")

    # A differing field type makes the type infos unequal.
    self.assertNotEqual(TupleTypeInfo([Types.STRING(), Types.INT()]),
                        TupleTypeInfo([Types.STRING(), Types.BOOLEAN()]))

    # The Types.TUPLE factory yields an equal TupleTypeInfo.
    self.assertEqual(Types.TUPLE([Types.STRING(), Types.INT()]),
                     TupleTypeInfo([Types.STRING(), Types.INT()]))

    self.assertEqual(Types.TUPLE([Types.STRING(), Types.INT()]).get_field_types(),
                     [Types.STRING(), Types.INT()])
def test_side_output_late_data(self):
    """Route late elements of a tumbling window to a side output.

    Uses a per-element watermark generator obtained through the Java gateway,
    5 ms tumbling event-time windows and zero allowed lateness. The main
    output and the ``late-data`` side output are collected by separate sinks.
    """
    self.env.set_parallelism(1)
    j_conf = get_j_env_configuration(self.env._j_stream_execution_environment)
    config = Configuration(j_configuration=j_conf)
    config.set_integer('python.fn-execution.bundle.size', 1)

    # Build the Java watermark strategy first, then wrap it on the Python side.
    jvm = get_gateway().jvm
    j_generator_supplier = jvm.org.apache.flink.streaming.api.functions.python.eventtime \
        .PerElementWatermarkGenerator.getSupplier()
    j_watermark_strategy = jvm.org.apache.flink.api.common.eventtime.WatermarkStrategy \
        .forGenerator(j_generator_supplier)
    watermark_strategy = WatermarkStrategy(j_watermark_strategy) \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    tag = OutputTag('late-data', type_info=Types.ROW([Types.STRING(), Types.INT()]))
    ds1 = self.env.from_collection(
        [('a', 0), ('a', 8), ('a', 4), ('a', 6)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))

    windowed = (
        ds1.assign_timestamps_and_watermarks(watermark_strategy)
        .key_by(lambda e: e[0])
        .window(TumblingEventTimeWindows.of(Time.milliseconds(5)))
        .allowed_lateness(0)
        .side_output_late_data(tag)
    )
    ds2 = windowed.process(
        CountWindowProcessFunction(),
        Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()]))

    main_sink = DataStreamTestSinkFunction()
    ds2.add_sink(main_sink)
    side_sink = DataStreamTestSinkFunction()
    ds2.get_side_output(tag).add_sink(side_sink)

    self.env.execute('test_side_output_late_data')

    self.assert_equals_sorted(['(a,0,5,1)', '(a,5,10,2)'], main_sink.get_results())
    self.assert_equals_sorted(['+I[a, 4]'], side_sink.get_results())
def event_timer_timer_demo():
    """Run a small event-time demo job and print its output.

    Builds a stream of ``(long, str, float)`` tuples, assigns timestamps and
    watermarks with a bounded out-of-orderness of two seconds, keys the
    stream by the second field and applies the ``Sum`` process function.
    """
    env = StreamExecutionEnvironment.get_execution_environment()

    records = [
        (1000, 'Alice', 110.1),
        (4000, 'Bob', 30.2),
        (3000, 'Alice', 20.0),
        (2000, 'Bob', 53.1),
        (5000, 'Alice', 13.1),
        (3000, 'Bob', 3.1),
        (7000, 'Bob', 16.1),
        (10000, 'Alice', 20.1),
    ]
    ds = env.from_collection(
        collection=records,
        type_info=Types.TUPLE([Types.LONG(), Types.STRING(), Types.FLOAT()]))

    strategy = WatermarkStrategy \
        .for_bounded_out_of_orderness(Duration.of_seconds(2)) \
        .with_timestamp_assigner(MyTimestampAssigner())
    ds = ds.assign_timestamps_and_watermarks(strategy)

    # Apply the process function onto a keyed stream.
    keyed_stream = ds.key_by(lambda value: value[1])
    keyed_stream.process(Sum()).print()

    # Submit for execution.
    env.execute()
def test_window_all_reduce_process(self):
    """Reduce non-keyed session windows and post-process the reduced value.

    The reduce keeps the first element's key and sums the second fields; the
    ProcessAllWindowFunction formats the window start and the reduced tuple.
    """
    self.env.set_parallelism(1)
    records = [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)]
    data_stream = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    class MyProcessFunction(ProcessAllWindowFunction):
        """Formats the window start together with the reduced element."""

        def process(self,
                    context: 'ProcessAllWindowFunction.Context',
                    elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
            window_start = context.window().start
            reduced = next(iter(elements))
            yield "current window start at {}, reduce result {}".format(
                window_start, reduced)

    all_windows = data_stream \
        .assign_timestamps_and_watermarks(watermark_strategy) \
        .window_all(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
    reduced_stream = all_windows.reduce(
        lambda a, b: (a[0], a[1] + b[1]),
        window_function=MyProcessFunction(),
        output_type=Types.STRING())
    reduced_stream.add_sink(self.test_sink)

    self.env.execute('test_window_all_reduce_process')
    expected = [
        "current window start at 1, reduce result ('a', 6)",
        "current window start at 6, reduce result ('a', 23)",
        "current window start at 15, reduce result ('a', 15)"
    ]
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_project(self):
    """Check that ``project`` adds a 'Projection' node to the execution plan."""
    import json

    ds = self.env.from_collection(
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        type_info=Types.TUPLE([Types.INT(), Types.INT(), Types.INT(), Types.INT()]))
    ds.project(1, 3).map(lambda x: (x[0], x[1] + 1)).add_sink(self.test_sink)

    # The plan is a JSON document: parse it as data rather than evaluating it
    # as Python source, which would also fail on JSON-only literals such as
    # true/false/null.
    exec_plan = json.loads(self.env.get_execution_plan())
    self.assertEqual(exec_plan['nodes'][1]['type'], 'Projection')
def test_session_window_late_merge(self):
    """Check that an element arriving between two sessions merges them.

    With a 5 ms session gap, the elements at 0, 8 and 4 are expected to end
    up in a single window holding three elements.
    """
    records = [('hi', 0), ('hi', 8), ('hi', 4)]
    data_stream = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    session_windows = (
        data_stream.assign_timestamps_and_watermarks(watermark_strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(5)))
    )
    session_windows.process(
        CountWindowProcessFunction(),
        Types.TUPLE([Types.STRING(), Types.INT()])).add_sink(self.test_sink)

    self.env.execute('test_session_window_late_merge')
    self.assert_equals_sorted(['(hi,3)'], self.test_sink.get_results())
def test_window_reduce_passthrough(self):
    """Reduce keyed session windows (2 ms gap) without a window function.

    The reduce lambda keeps the later element's key and sums the second
    fields.
    """
    records = [
        ('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)]
    data_stream = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    session_windows = (
        data_stream.assign_timestamps_and_watermarks(watermark_strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2)))
    )
    session_windows.reduce(
        lambda a, b: (b[0], a[1] + b[1]),
        output_type=Types.TUPLE([Types.STRING(), Types.INT()])).add_sink(self.test_sink)

    self.env.execute('test_time_window_reduce_passthrough')
    expected = ['(a,3)', '(a,6)', '(a,15)', '(b,3)', '(b,17)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_aggregating_state(self):
    """Check that a keyed AggregatingState accumulates a running sum per key.

    Each element adds its first field to the aggregating state of its key
    (the second field) and emits the current aggregate together with the key.
    """
    self.env.set_parallelism(2)
    records = [(1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'), (5, 'hi'), (6, 'hello')]
    data_stream = self.env.from_collection(
        records, type_info=Types.TUPLE([Types.INT(), Types.STRING()]))

    class MyAggregateFunction(AggregateFunction):
        """Plain integer sum with an integer accumulator."""

        def create_accumulator(self):
            return 0

        def add(self, value, accumulator):
            return value + accumulator

        def get_result(self, accumulator):
            return accumulator

        def merge(self, acc_a, acc_b):
            return acc_a + acc_b

    class MyProcessFunction(KeyedProcessFunction):
        """Emits the running aggregate kept in an AggregatingState."""

        def __init__(self):
            self.aggregating_state = None  # type: AggregatingState

        def open(self, runtime_context: RuntimeContext):
            descriptor = AggregatingStateDescriptor(
                'aggregating_state', MyAggregateFunction(), Types.INT())
            self.aggregating_state = runtime_context.get_aggregating_state(descriptor)

        def process_element(self, value, ctx):
            self.aggregating_state.add(value[0])
            yield Row(self.aggregating_state.get(), value[1])

    keyed_stream = data_stream.key_by(lambda x: x[1], key_type_info=Types.STRING())
    summed_stream = keyed_stream.process(
        MyProcessFunction(),
        output_type=Types.TUPLE([Types.INT(), Types.STRING()]))
    summed_stream.add_sink(self.test_sink)
    self.env.execute('test_aggregating_state')

    # Output order is not compared; both sides are sorted first.
    actual = sorted(self.test_sink.get_results())
    expected = sorted([
        '(1,hi)', '(2,hello)', '(4,hi)', '(6,hello)', '(9,hi)', '(12,hello)'
    ])
    self.assertEqual(expected, actual)
def test_event_time_tumbling_window(self):
    """Run CountWindowProcessFunction over 5 ms tumbling event-time windows.

    The expected output has four fields per window, declared as
    ``(STRING, LONG, LONG, INT)``.
    """
    records = [
        ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4),
        ('hi', 5), ('hi', 8), ('hi', 9), ('hi', 15)]
    data_stream = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    tumbling_windows = (
        data_stream.assign_timestamps_and_watermarks(watermark_strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(TumblingEventTimeWindows.of(Time.milliseconds(5)))
    )
    output_type = Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()])
    tumbling_windows.process(CountWindowProcessFunction(), output_type) \
        .add_sink(self.test_sink)

    self.env.execute('test_event_time_tumbling_window')
    expected = ['(hi,0,5,4)', '(hi,5,10,3)', '(hi,15,20,1)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_event_time_dynamic_gap_session_window(self):
    """Run CountWindowProcessFunction over dynamic-gap session windows.

    The session gap comes from ``MySessionWindowTimeGapExtractor``, which is
    defined outside this test.
    """
    self.env.set_parallelism(1)
    records = [
        ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 9), ('hi', 9), ('hi', 15)]
    data_stream = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    window_assigner = EventTimeSessionWindows.with_dynamic_gap(
        MySessionWindowTimeGapExtractor())
    session_windows = (
        data_stream.assign_timestamps_and_watermarks(watermark_strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(window_assigner)
    )
    session_windows.process(
        CountWindowProcessFunction(),
        Types.TUPLE([Types.STRING(), Types.INT()])).add_sink(self.test_sink)

    self.env.execute('test_event_time_dynamic_gap_session_window')
    self.assert_equals_sorted(['(hi,3)', '(hi,4)'], self.test_sink.get_results())
def test_event_time_session_window_with_purging_trigger(self):
    """Run session windows (3 ms gap) with a purging event-time trigger.

    The trigger is ``PurgingTrigger.of(EventTimeTrigger.create())`` and the
    output is declared as ``(STRING, LONG, LONG, INT)``.
    """
    records = [
        ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9), ('hi', 15)]
    data_stream = self.env.from_collection(
        records,
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    session_windows = (
        data_stream.assign_timestamps_and_watermarks(watermark_strategy)
        .key_by(lambda x: x[0], key_type=Types.STRING())
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(3)))
        .trigger(PurgingTrigger.of(EventTimeTrigger.create()))
    )
    output_type = Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()])
    session_windows.process(CountWindowProcessFunction(), output_type) \
        .add_sink(self.test_sink)

    self.env.execute('test_event_time_session_window_with_purging_trigger')
    expected = ['(hi,1,7,4)', '(hi,8,12,2)', '(hi,15,18,1)']
    self.assert_equals_sorted(expected, self.test_sink.get_results())
def test_from_and_to_data_stream_event_time(self):
    """Round-trip a watermarked DataStream through the Table API.

    Covers three things: the resolved schema of the table created from the
    stream, a tumbling-window SQL aggregation over a temporary view, and an
    event-time window applied after converting the table back to a stream.
    """
    from pyflink.table import Schema

    def rowtime_schema():
        # Exposes the stream record timestamp as `rowtime` and reuses the
        # source watermark; a fresh schema is built on every call.
        return Schema.new_builder() \
            .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)") \
            .watermark("rowtime", "SOURCE_WATERMARK()") \
            .build()

    row_type = Types.ROW_NAMED(
        ["a", "b", "c"],
        [Types.LONG(), Types.INT(), Types.STRING()])
    ds = self.env.from_collection(
        [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
        row_type)
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(MyTimestampAssigner())
    ds = ds.assign_timestamps_and_watermarks(watermark_strategy)

    table = self.t_env.from_data_stream(ds, rowtime_schema())
    # NOTE(review): the whitespace inside this expected-schema literal is
    # reproduced exactly as it appears in the reviewed source; confirm it
    # matches the line breaks of getResolvedSchema().toString().
    self.assertEqual("""( `a` BIGINT, `b` INT, `c` STRING, `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA, WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK() )""",
                     table._j_table.getResolvedSchema().toString())

    self.t_env.create_temporary_view("t", ds, rowtime_schema())

    table_result = self.t_env.execute_sql(
        "SELECT "
        "c, SUM(b) "
        "FROM t "
        "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
    with table_result.collect() as rows:
        collected_result = sorted(str(item) for item in rows)
    expected_result = sorted(
        str(row) for row in [Row('a', 47), Row('c', 1000), Row('c', 1000)])
    self.assertEqual(expected_result, collected_result)

    ds = self.t_env.to_data_stream(table)
    summed_stream = ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
        .window(MyTumblingEventTimeWindow()) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()]))
    summed_stream.add_sink(self.test_sink)
    self.env.execute()

    expected_results = sorted(['(a,47)', '(c,1000)', '(c,1000)'])
    actual_results = sorted(self.test_sink.get_results(False))
    self.assertEqual(expected_results, actual_results)
def test_from_and_to_changelog_stream_event_time(self):
    """Round-trip a watermarked changelog stream through the Table API.

    A changelog stream is turned into a table with a rowtime metadata column
    and a computed column, its columns are reordered with SQL, and the result
    is written back to a changelog stream on which an event-time window is
    applied.
    """
    from pyflink.table import Schema

    self.env.set_parallelism(1)
    ds = self.env.from_collection(
        [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
        Types.ROW([Types.LONG(), Types.INT(), Types.STRING()]))
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(MyTimestampAssigner())
    ds = ds.assign_timestamps_and_watermarks(watermark_strategy)

    changelog_stream = ds.map(
        lambda t: Row(t.f1, t.f2),
        Types.ROW([Types.INT(), Types.STRING()]))

    # Derive physical columns and add a rowtime.
    source_schema = Schema.new_builder() \
        .column_by_metadata("rowtime", DataTypes.TIMESTAMP_LTZ(3)) \
        .column_by_expression("computed", str(col("f1").upper_case)) \
        .watermark("rowtime", str(source_watermark())) \
        .build()
    table = self.t_env.from_changelog_stream(changelog_stream, source_schema)
    self.t_env.create_temporary_view("t", table)

    # Access and reorder columns.
    reordered = self.t_env.sql_query("SELECT computed, rowtime, f0 FROM t")

    # Write out the rowtime column with a fully declared schema.
    sink_schema = Schema.new_builder() \
        .column("f1", DataTypes.STRING()) \
        .column_by_metadata("rowtime", DataTypes.TIMESTAMP_LTZ(3)) \
        .column_by_expression("ignored", str(col("f1").upper_case)) \
        .column("f0", DataTypes.INT()) \
        .build()
    result = self.t_env.to_changelog_stream(reordered, sink_schema)

    # Test event time window and field access.
    summed_stream = result.key_by(lambda k: k.f1) \
        .window(MyTumblingEventTimeWindow()) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()]))
    summed_stream.add_sink(self.test_sink)
    self.env.execute()

    expected_results = sorted(['(A,47)', '(C,1000)', '(C,1000)'])
    actual_results = sorted(self.test_sink.get_results(False))
    self.assertEqual(expected_results, actual_results)
def test_map_function_with_data_types(self):
    """Map ``(str, int)`` tuples to Rows of ``(str, len(str), int)``.

    Results are read from the test sink with ``get_results(False)``.
    """
    records = [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)]
    ds = self.env.from_collection(
        records, type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

    def map_func(value):
        return Row(value[0], len(value[0]), value[1])

    mapped_stream = ds.map(
        map_func,
        output_type=Types.ROW([Types.STRING(), Types.INT(), Types.INT()]))
    mapped_stream.add_sink(self.test_sink)
    self.env.execute('map_function_test')

    # Output order is not compared; both sides are sorted first.
    actual = sorted(self.test_sink.get_results(False))
    expected = sorted(['ab,2,1', 'bdc,3,2', 'cfgs,4,3', 'deeefg,6,4'])
    self.assertEqual(expected, actual)
def test_map_function_with_data_types(self):
    """Map ``(str, int)`` tuples to ``(str, len(str), int)`` with a ROW type.

    Results are gathered through ``DataStreamCollectUtil``.
    """
    records = [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)]
    ds = self.env.from_collection(
        records, type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

    def map_func(value):
        return value[0], len(value[0]), value[1]

    row_type = Types.ROW([Types.STRING(), Types.INT(), Types.INT()])
    mapped_stream = ds.map(map_func, type_info=row_type)

    collect_util = DataStreamCollectUtil()
    collect_util.collect(mapped_stream)
    self.env.execute('map_function_test')

    # Output order is not compared; both sides are sorted first.
    actual = sorted(collect_util.results())
    expected = sorted(['ab,2,1', 'bdc,3,2', 'cfgs,4,3', 'deeefg,6,4'])
    self.assertEqual(expected, actual)
def test_from_java_type(self):
    """Check that ``_from_java_type`` round-trips every listed type info.

    For each type info, the Java type info obtained from
    ``get_java_type_info()`` is converted back with ``_from_java_type`` and
    compared with the original.
    """
    # Factories instead of instances, so that each type info is created right
    # before its own assertion. SQL_DATE is checked twice on purpose.
    type_factories = [
        Types.INT,
        Types.SHORT,
        Types.LONG,
        Types.FLOAT,
        Types.DOUBLE,
        Types.CHAR,
        Types.BYTE,
        Types.BIG_INT,
        Types.BIG_DEC,
        Types.SQL_DATE,
        Types.SQL_TIME,
        Types.SQL_TIMESTAMP,
        lambda: Types.ROW([Types.INT(), Types.STRING()]),
        lambda: Types.TUPLE([Types.CHAR(), Types.INT()]),
        lambda: Types.PRIMITIVE_ARRAY(Types.INT()),
        lambda: Types.OBJECT_ARRAY(Types.SQL_DATE()),
        Types.PICKLED_BYTE_ARRAY,
        Types.SQL_DATE,
        lambda: Types.MAP(Types.INT(), Types.STRING()),
        lambda: Types.LIST(Types.INT()),
    ]

    for make_type_info in type_factories:
        type_info = make_type_info()
        self.assertEqual(type_info,
                         _from_java_type(type_info.get_java_type_info()))
def ds_operators():
    """Scratch walkthrough of DataStream operators, ending with a file sink.

    Creates a local execution environment with parallelism 1 and a hard-coded
    Python executable path, then calls a series of DataStream operators on
    small in-memory collections before executing the job.

    NOTE(review): several operator calls below pass no arguments; PyFlink's
    ``map``/``flat_map``/``filter``/``add_sink`` appear to require one, so
    this function probably fails before reaching ``execute`` - confirm.
    """
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # Machine-specific interpreter path.
    s_env.set_python_executable(
        r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))
    # The string literal below is a no-op expression statement holding the
    # author's notes (partly in Chinese). English summary:
    #   map, flat_map, filter
    #   key_by   DataStream -> KeyedStream
    #   reduce   KeyedStream -> DataStream
    #   union    DataStream* -> DataStream
    #   connect  DataStream, DataStream -> ConnectedStreams
    #   Tuple transformation: project
    #   Partitioning:
    #     partition_custom  custom partitioning
    #     shuffle           random partitioning (uniformly distributed)
    #     rebalance         round-robin partitioning
    #     rescale           repartitioning
    #     broadcast         broadcasts elements to every partition
    #   Free-form customization:
    #     process  keyed state and timers (TimerService) are only accessible
    #              when a ProcessFunction is applied on a KeyedStream
    #              (comparable to windows in Java)
    #   Other: start_new_chain, disable_chaining, slot_sharing_group
    """ map flat_map filter key_by DataStream → KeyedStream reduce KeyedStream → DataStream union DataStream* → DataStream connect DataStream,DataStream → ConnectedStreams 转换元组: project 分区: partition_custom 自定义分区 shuffle 随机分区 根据均匀分布随机划分元素。 rebalance 轮询分区 rescale 重新分区 broadcast 向每个分区广播元素 随意定制 process 只有在KeyedStream上应用ProcessFunction时,才可以访问键控状态和计时器TimerService(相当于java的windows)。 其它 start_new_chain disable_chaining slot_sharing_group """
    # The return values of the next four calls are discarded.
    ds.rescale()
    # NOTE(review): called without a function argument - confirm intent.
    ds.map()
    ds.flat_map()
    ds.filter()
    # KeyBy DataStream → KeyedStream
    # Reduce KeyedStream → DataStream
    ds = s_env.from_collection([(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
                               type_info=Types.ROW(
                                   [Types.INT(), Types.STRING()]))
    ds.key_by(lambda a: a[1]) \
        .reduce(lambda a, b: Row(a[0] + b[0], b[1]))
    # Broadcast
    ds.broadcast()
    # project is only available on tuple-typed streams
    ds = s_env.from_collection([[1, 2, 3, 4], [5, 6, 7, 8]],
                               type_info=Types.TUPLE([
                                   Types.INT(), Types.INT(), Types.INT(), Types.INT()
                               ]))
    # Emit the fields at tuple indexes 1 and 3
    # NOTE(review): add_sink() is called without a sink function - confirm.
    ds.project(1, 3).map(lambda x: (x[0], x[1] + 1)).add_sink()
    # Storage: row-format file sink with a rolling policy and a part-file
    # prefix/suffix configuration.
    ds.add_sink(
        StreamingFileSink.for_row_format(
            '/tmp/output', SimpleStringEncoder()).with_rolling_policy(
            DefaultRollingPolicy.builder().with_rollover_interval(
                15 * 60 * 1000).with_inactivity_interval(
                5 * 60 * 1000).with_max_part_size(1024 * 1024 * 1024).build()).
        with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder().with_part_prefix(
                "prefix").with_part_suffix("suffix").build()).build())
    s_env.execute('ds_operators')