from pyflink.common import WatermarkStrategy, Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
from pyflink.datastream.connectors import (FileSource, StreamFormat, FileSink,
                                           OutputFileConfig, RollingPolicy)


def word_count(input_path, output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    if input_path is not None:
        ds = env.from_source(
            source=FileSource.for_record_stream_format(StreamFormat.text_line_format(),
                                                       input_path)
                             .process_static_file_set().build(),
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="file_source"
        )
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        ds = env.from_collection(word_count_data)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
           .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
           .key_by(lambda i: i[0]) \
           .reduce(lambda i, j: (i[0], i[1] + j[1]))

    # define the sink
    if output_path is not None:
        ds.sink_to(
            sink=FileSink.for_row_format(
                base_path=output_path,
                encoder=Encoder.simple_string_encoder())
            .with_output_file_config(
                OutputFileConfig.builder()
                .with_part_prefix("prefix")
                .with_part_suffix(".ext")
                .build())
            .with_rolling_policy(RollingPolicy.default_rolling_policy())
            .build()
        )
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        ds.print()

    # submit for execution
    env.execute()
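# The function above falls back to a module-level word_count_data collection
# when no input path is given, but that collection is not shown in this
# section. A minimal sketch of what it could contain, plus a driver, assuming
# the script is invoked with optional --input/--output flags (the sample
# lines are illustrative, not the original data set):
import argparse

word_count_data = ["To be, or not to be, that is the question",
                   "Whether tis nobler in the mind to suffer"]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='input file path')
    parser.add_argument('--output', dest='output', required=False,
                        help='output file path')
    known_args, _ = parser.parse_known_args()
    word_count(known_args.input, known_args.output)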
from pyflink.common import WatermarkStrategy, Row, Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
from pyflink.datastream.connectors import NumberSequenceSource, FileSink, OutputFileConfig


def state_access_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    # bounded source that emits the numbers 1 through 10
    seq_num_source = NumberSequenceSource(1, 10)

    output_path = '/opt/examples/datastream/output/state_access'
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(
            OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='seq_num_source',
        type_info=Types.LONG())

    # key the numbers into four groups (a % 4) and count per key with keyed
    # state, first in a stateful map and then in a keyed process function
    ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
      .key_by(lambda a: a[0]) \
      .map(MyMapFunction(), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
      .key_by(lambda a: a[0]) \
      .process(MyKeyedProcessFunction(), Types.LONG()) \
      .sink_to(file_sink)

    env.execute('11-data_stream_state_access')
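# The demo above references MyMapFunction and MyKeyedProcessFunction, which
# are not shown in this section. A minimal sketch of how they might access
# keyed state, assuming MyMapFunction keeps a running per-key count in a
# ValueState and MyKeyedProcessFunction forwards that count downstream (the
# class names come from the demo; the bodies are illustrative):
from pyflink.common import Row
from pyflink.common.typeinfo import Types
from pyflink.datastream.functions import (MapFunction, KeyedProcessFunction,
                                          RuntimeContext)
from pyflink.datastream.state import ValueStateDescriptor


class MyMapFunction(MapFunction):

    def open(self, runtime_context: RuntimeContext):
        # one ValueState slot per key, holding the running count
        self.cnt_state = runtime_context.get_state(
            ValueStateDescriptor('cnt', Types.LONG()))

    def map(self, value):
        cnt = self.cnt_state.value()
        cnt = 1 if cnt is None else cnt + 1
        self.cnt_state.update(cnt)
        return Row(value[0], cnt)


class MyKeyedProcessFunction(KeyedProcessFunction):

    def process_element(self, value, ctx):
        # emit the accumulated count for the current key
        yield value[1]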
def test_stream_file_sink(self):
    self.env.set_parallelism(2)
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))
    ds.map(
        lambda a: a[0],
        Types.STRING()).add_sink(
        StreamingFileSink.for_row_format(self.tempdir, SimpleStringEncoder())
        .with_rolling_policy(
            DefaultRollingPolicy.builder()
            .with_rollover_interval(15 * 60 * 1000)
            .with_inactivity_interval(5 * 60 * 1000)
            .with_max_part_size(1024 * 1024 * 1024).build())
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix").build()).build())

    self.env.execute("test_streaming_file_sink")

    results = []
    import os
    for root, dirs, files in os.walk(self.tempdir, topdown=True):
        for file in files:
            # without checkpointing the part files stay in-progress, so they
            # keep the hidden-file dot prefix: '.prefix-...suffix...'
            self.assertTrue(file.startswith('.prefix'))
            self.assertTrue('suffix' in file)
            path = root + "/" + file
            with open(path) as infile:
                for line in infile:
                    results.append(line)

    expected = ['deeefg\n', 'bdc\n', 'ab\n', 'cfgs\n']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
from pyflink.common import WatermarkStrategy, Row, Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
from pyflink.datastream.connectors import (FileSource, StreamFormat, FileSink,
                                           OutputFileConfig)


def data_stream_word_count_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    input_path = '/opt/examples/datastream/input/word_count_input'
    output_path = '/opt/examples/datastream/output/data_stream_word_count'

    # bounded file source that reads the input line by line
    file_source = FileSource \
        .for_record_stream_format(StreamFormat.text_line_format(), input_path) \
        .process_static_file_set() \
        .build()
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(
            OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=file_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='file_source',
        type_info=Types.STRING())

    # count occurrences of each line; in BATCH mode the reduce emits a single
    # final count per key
    ds.map(lambda a: Row(a, 1), output_type=Types.ROW([Types.STRING(), Types.INT()])) \
      .key_by(lambda a: a[0]) \
      .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
      .sink_to(file_sink)

    env.execute('9-data_stream_word_count')
from pyflink.common import WatermarkStrategy, Row, Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
from pyflink.datastream.connectors import NumberSequenceSource, FileSink, OutputFileConfig


def batch_seq_num_test():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(2)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    # bounded source that emits the numbers 1 through 1000
    seq_num_source = NumberSequenceSource(1, 1000)

    output_path = '/opt/examples/output/batch_seq_num'
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(
            OutputFileConfig.builder().with_part_prefix('pre').with_part_suffix('suf').build()) \
        .build()

    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='seq_num_source',
        type_info=Types.LONG())

    # group the numbers into four buckets (a % 4) and count each bucket
    ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
      .key_by(lambda a: a[0]) \
      .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
      .sink_to(file_sink)

    env.execute('9-data_stream_batch_seq_num')
def test_file_sink(self):
    base_path = "/tmp/1.txt"
    encoder = Encoder.simple_string_encoder()
    file_sink_builder = FileSink.for_row_format(base_path, encoder)
    file_sink = file_sink_builder \
        .with_bucket_check_interval(1000) \
        .with_bucket_assigner(BucketAssigner.base_path_bucket_assigner()) \
        .with_rolling_policy(RollingPolicy.on_checkpoint_rolling_policy()) \
        .with_output_file_config(
            OutputFileConfig.builder().with_part_prefix("pre").with_part_suffix("suf").build()) \
        .build()

    # use reflection on the underlying Java FileSink to verify that every
    # builder option actually reached the Java side
    buckets_builder_field = \
        load_java_class("org.apache.flink.connector.file.sink.FileSink"). \
        getDeclaredField("bucketsBuilder")
    buckets_builder_field.setAccessible(True)
    buckets_builder = buckets_builder_field.get(file_sink.get_java_function())

    self.assertEqual("DefaultRowFormatBuilder",
                     buckets_builder.getClass().getSimpleName())

    row_format_builder_clz = load_java_class(
        "org.apache.flink.connector.file.sink.FileSink$RowFormatBuilder")

    encoder_field = row_format_builder_clz.getDeclaredField("encoder")
    encoder_field.setAccessible(True)
    self.assertEqual(
        "SimpleStringEncoder",
        encoder_field.get(buckets_builder).getClass().getSimpleName())

    interval_field = row_format_builder_clz.getDeclaredField("bucketCheckInterval")
    interval_field.setAccessible(True)
    self.assertEqual(1000, interval_field.get(buckets_builder))

    bucket_assigner_field = row_format_builder_clz.getDeclaredField("bucketAssigner")
    bucket_assigner_field.setAccessible(True)
    self.assertEqual(
        "BasePathBucketAssigner",
        bucket_assigner_field.get(buckets_builder).getClass().getSimpleName())

    rolling_policy_field = row_format_builder_clz.getDeclaredField("rollingPolicy")
    rolling_policy_field.setAccessible(True)
    self.assertEqual(
        "OnCheckpointRollingPolicy",
        rolling_policy_field.get(buckets_builder).getClass().getSimpleName())

    output_file_config_field = row_format_builder_clz.getDeclaredField("outputFileConfig")
    output_file_config_field.setAccessible(True)
    output_file_config = output_file_config_field.get(buckets_builder)
    self.assertEqual("pre", output_file_config.getPartPrefix())
    self.assertEqual("suf", output_file_config.getPartSuffix())
# (the opening of this snippet is truncated in the source; it begins inside
# the env.from_collection(...) call that builds the sample stream)
data_stream = env.from_collection(
    [('hi', 4), ('hi', 5), ('hi', 8), ('hi', 9), ('hi', 15)],
    type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

# define the watermark strategy
watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
    .with_timestamp_assigner(MyTimestampAssigner())

ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
    .key_by(lambda x: x[0], key_type=Types.STRING()) \
    .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
    .process(CountWindowProcessFunction(),
             Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()]))

# define the sink
if output_path is not None:
    ds.sink_to(
        sink=FileSink.for_row_format(
            base_path=output_path,
            encoder=Encoder.simple_string_encoder())
        .with_output_file_config(
            OutputFileConfig.builder()
            .with_part_prefix("prefix")
            .with_part_suffix(".ext")
            .build())
        .with_rolling_policy(RollingPolicy.default_rolling_policy())
        .build())
else:
    print("Printing result to stdout. Use --output to specify output path.")
    ds.print()

# submit for execution
env.execute()
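# The windowing snippet above references MyTimestampAssigner and
# CountWindowProcessFunction without showing them. A minimal sketch, assuming
# the second tuple field carries the event timestamp and the window function
# emits (key, window_start, window_end, element_count) per window (the class
# names come from the snippet; the bodies are illustrative):
from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream.functions import ProcessWindowFunction


class MyTimestampAssigner(TimestampAssigner):

    def extract_timestamp(self, value, record_timestamp) -> int:
        # use the int field of ('hi', 4) as the event-time timestamp
        return int(value[1])


class CountWindowProcessFunction(ProcessWindowFunction):

    def process(self, key, context, elements):
        # one output record per window: key, window bounds, element count
        return [(key, context.window().start, context.window().end,
                 len([e for e in elements]))]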
from pyflink.common import Row
from pyflink.common.serialization import SimpleStringEncoder
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import (StreamingFileSink, DefaultRollingPolicy,
                                           OutputFileConfig)


def ds_operators():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_python_executable(
        r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))
    """
    map
    flat_map
    filter
    key_by   DataStream → KeyedStream
    reduce   KeyedStream → DataStream
    union    DataStream* → DataStream
    connect  DataStream, DataStream → ConnectedStreams

    Tuple transformation:
      project

    Partitioning:
      partition_custom  custom partitioning
      shuffle           random partitioning: distributes elements uniformly at random
      rebalance         round-robin partitioning
      rescale           repartitioning
      broadcast         broadcasts every element to every partition

    Free-form processing:
      process  keyed state and the TimerService timers are only accessible
               when a ProcessFunction is applied to a KeyedStream (roughly
               the equivalent of windows in the Java API)

    Others: start_new_chain, disable_chaining, slot_sharing_group
    """
    ds.rescale()
    ds.map(lambda r: r)
    ds.flat_map(lambda r: [r])
    ds.filter(lambda r: True)

    # key_by: DataStream → KeyedStream
    # reduce: KeyedStream → DataStream
    ds = s_env.from_collection(
        [(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
        type_info=Types.ROW([Types.INT(), Types.STRING()]))
    ds.key_by(lambda a: a[1]) \
      .reduce(lambda a, b: Row(a[0] + b[0], b[1]))

    # broadcast
    ds.broadcast()

    # project only works on tuple-typed streams
    ds = s_env.from_collection(
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        type_info=Types.TUPLE([Types.INT(), Types.INT(), Types.INT(), Types.INT()]))
    # emit the fields at tuple indexes 1 and 3
    ds.project(1, 3).map(lambda x: (x[0], x[1] + 1)).print()

    # write out with a streaming file sink
    ds.add_sink(
        StreamingFileSink.for_row_format(
            '/tmp/output', SimpleStringEncoder())
        .with_rolling_policy(
            DefaultRollingPolicy.builder()
            .with_rollover_interval(15 * 60 * 1000)
            .with_inactivity_interval(5 * 60 * 1000)
            .with_max_part_size(1024 * 1024 * 1024)
            .build())
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix")
            .build())
        .build())

    s_env.execute('ds_operators')