def test_stream_file_sink(self):
    """Write rows through a StreamingFileSink and verify the part files.

    Runs a two-parallelism job that maps Row(str, int) records to their
    string field, sinks them to ``self.tempdir``, then walks the output
    directory and checks file naming and contents.
    """
    self.env.set_parallelism(2)
    source = self.env.from_collection(
        [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))

    # Build the sink pieces separately for readability.
    rolling_policy = DefaultRollingPolicy.builder() \
        .with_rollover_interval(15 * 60 * 1000) \
        .with_inactivity_interval(5 * 60 * 1000) \
        .with_max_part_size(1024 * 1024 * 1024) \
        .build()
    file_config = OutputFileConfig.OutputFileConfigBuilder() \
        .with_part_prefix("prefix") \
        .with_part_suffix("suffix") \
        .build()
    sink = StreamingFileSink \
        .for_row_format(self.tempdir, SimpleStringEncoder()) \
        .with_rolling_policy(rolling_policy) \
        .with_output_file_config(file_config) \
        .build()

    source.map(lambda a: a[0], Types.STRING()).add_sink(sink)
    self.env.execute("test_streaming_file_sink")

    results = []
    import os
    for root, dirs, files in os.walk(self.tempdir, topdown=True):
        for file in files:
            # Part files carry a leading '.' while in-progress/pending,
            # so the configured prefix appears as '.prefix'.
            self.assertTrue(file.startswith('.prefix'))
            self.assertTrue('suffix' in file)
            path = root + "/" + file
            with open(path) as infile:
                for line in infile:
                    results.append(line)

    # Order across parallel part files is nondeterministic; compare sorted.
    expected = ['deeefg\n', 'bdc\n', 'ab\n', 'cfgs\n']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
def ds_operators():
    """Demonstrate common PyFlink DataStream operators.

    Fixes over the original: ``map``/``flat_map``/``filter`` were invoked
    with no arguments, and the ``project`` chain ended in a bare
    ``add_sink()`` — all of which raise TypeError at runtime. Example
    callables are supplied and the bare sink is replaced with ``print()``.
    """
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # NOTE(review): hard-coded local interpreter path — only valid on the
    # original author's machine; adjust or drop for other environments.
    s_env.set_python_executable(
        r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))
    """
    map / flat_map / filter
    key_by     DataStream -> KeyedStream
    reduce     KeyedStream -> DataStream
    union      DataStream* -> DataStream
    connect    DataStream, DataStream -> ConnectedStreams
    Tuple transform: project
    Partitioning:
        partition_custom  custom partitioner
        shuffle           random partitioning (uniform distribution)
        rebalance         round-robin partitioning
        rescale           repartitioning
        broadcast         broadcast each element to every partition
    Arbitrary custom:
        process  keyed state and the TimerService are accessible only when
                 a ProcessFunction is applied on a KeyedStream (analogous
                 to Java windows).
    Other: start_new_chain / disable_chaining / slot_sharing_group
    """
    ds.rescale()
    # These operators require a callable; the original called them with no
    # arguments, which raises TypeError before the job is even submitted.
    ds.map(lambda row: row)
    ds.flat_map(lambda row: [row])
    ds.filter(lambda row: True)

    # key_by: DataStream -> KeyedStream; reduce: KeyedStream -> DataStream
    ds = s_env.from_collection(
        [(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
        type_info=Types.ROW([Types.INT(), Types.STRING()]))
    ds.key_by(lambda a: a[1]) \
        .reduce(lambda a, b: Row(a[0] + b[0], b[1]))

    # Broadcast to every partition.
    ds.broadcast()

    # project works only on tuple-typed streams.
    ds = s_env.from_collection(
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        type_info=Types.TUPLE([
            Types.INT(), Types.INT(), Types.INT(), Types.INT()
        ]))
    # Emit tuple indexes 1 and 3. The original chain ended in a bare
    # add_sink() with no sink argument (TypeError); print to stdout instead.
    ds.project(1, 3).map(lambda x: (x[0], x[1] + 1)).print()

    # Persist to a rolling file sink under /tmp/output.
    ds.add_sink(
        StreamingFileSink
        .for_row_format('/tmp/output', SimpleStringEncoder())
        .with_rolling_policy(
            DefaultRollingPolicy.builder()
            .with_rollover_interval(15 * 60 * 1000)
            .with_inactivity_interval(5 * 60 * 1000)
            .with_max_part_size(1024 * 1024 * 1024)
            .build())
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix")
            .build())
        .build())
    s_env.execute('ds_operators')