from pyflink.common import Encoder, Types, WatermarkStrategy
from pyflink.datastream import RuntimeExecutionMode, StreamExecutionEnvironment
from pyflink.datastream.connectors.file_system import (FileSink, FileSource, OutputFileConfig,
                                                       RollingPolicy, StreamFormat)

# fallback input used when no input file is given (abridged sample;
# any list of text lines works here)
word_count_data = ["To be, or not to be, that is the question",
                   "Whether tis nobler in the mind to suffer"]


def word_count(input_path, output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    if input_path is not None:
        ds = env.from_source(
            source=FileSource.for_record_stream_format(StreamFormat.text_line_format(),
                                                       input_path)
                             .process_static_file_set().build(),
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="file_source"
        )
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        ds = env.from_collection(word_count_data)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
           .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
           .key_by(lambda i: i[0]) \
           .reduce(lambda i, j: (i[0], i[1] + j[1]))

    # define the sink
    if output_path is not None:
        ds.sink_to(
            sink=FileSink.for_row_format(
                base_path=output_path,
                encoder=Encoder.simple_string_encoder())
            .with_output_file_config(
                OutputFileConfig.builder()
                .with_part_prefix("prefix")
                .with_part_suffix(".ext")
                .build())
            .with_rolling_policy(RollingPolicy.default_rolling_policy())
            .build()
        )
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        ds.print()

    # submit for execution
    env.execute()
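# A minimal driver sketch for the example above (not part of the original
# snippet): it parses the --input/--output flags mentioned in the prints inside
# word_count() and submits the job; the exact CLI wiring is an assumption.
import argparse
import logging
import sys

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=False,
                        help='Output file to write results to.')
    known_args, _ = parser.parse_known_args(sys.argv[1:])

    word_count(known_args.input, known_args.output)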
def test_stream_file_sink(self):
    self.env.set_parallelism(2)
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))
    ds.map(
        lambda a: a[0],
        Types.STRING()).add_sink(
        StreamingFileSink.for_row_format(self.tempdir, Encoder.simple_string_encoder())
        .with_rolling_policy(
            RollingPolicy.default_rolling_policy(
                part_size=1024 * 1024 * 1024,
                rollover_interval=15 * 60 * 1000,
                inactivity_interval=5 * 60 * 1000))
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix").build()).build())

    self.env.execute("test_streaming_file_sink")

    results = []
    import os
    for root, dirs, files in os.walk(self.tempdir, topdown=True):
        for file in files:
            # in-progress part files are hidden, so names start with '.prefix'
            self.assertTrue(file.startswith('.prefix'))
            self.assertTrue('suffix' in file)
            path = root + "/" + file
            with open(path) as infile:
                for line in infile:
                    results.append(line)

    expected = ['deeefg\n', 'bdc\n', 'ab\n', 'cfgs\n']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
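# StreamingFileSink is the legacy writer; newer PyFlink exposes the unified
# FileSink used with sink_to(). A minimal sketch of the same pipeline on that
# API (this variant is an assumption, not part of the original test suite):
def test_stream_file_sink_unified_sketch(self):
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))
    ds.map(lambda a: a[0], Types.STRING()).sink_to(
        FileSink.for_row_format(self.tempdir, Encoder.simple_string_encoder())
        .with_rolling_policy(RollingPolicy.default_rolling_policy(
            part_size=1024 * 1024 * 1024,
            rollover_interval=15 * 60 * 1000,
            inactivity_interval=5 * 60 * 1000))
        .with_output_file_config(
            OutputFileConfig.builder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix").build())
        .build())
    self.env.execute("test_stream_file_sink_unified_sketch")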
def test_file_sink(self):
    base_path = "/tmp/1.txt"
    encoder = Encoder.simple_string_encoder()
    file_sink_builder = FileSink.for_row_format(base_path, encoder)
    file_sink = file_sink_builder \
        .with_bucket_check_interval(1000) \
        .with_bucket_assigner(BucketAssigner.base_path_bucket_assigner()) \
        .with_rolling_policy(RollingPolicy.on_checkpoint_rolling_policy()) \
        .with_output_file_config(
            OutputFileConfig.builder().with_part_prefix("pre").with_part_suffix("suf").build()) \
        .build()

    # inspect the private bucketsBuilder of the underlying Java FileSink via
    # Py4J reflection to verify that the Python builder settings were applied
    buckets_builder_field = \
        load_java_class("org.apache.flink.connector.file.sink.FileSink") \
        .getDeclaredField("bucketsBuilder")
    buckets_builder_field.setAccessible(True)
    buckets_builder = buckets_builder_field.get(file_sink.get_java_function())

    self.assertEqual("DefaultRowFormatBuilder",
                     buckets_builder.getClass().getSimpleName())

    row_format_builder_clz = load_java_class(
        "org.apache.flink.connector.file.sink.FileSink$RowFormatBuilder")

    encoder_field = row_format_builder_clz.getDeclaredField("encoder")
    encoder_field.setAccessible(True)
    self.assertEqual("SimpleStringEncoder",
                     encoder_field.get(buckets_builder).getClass().getSimpleName())

    interval_field = row_format_builder_clz.getDeclaredField("bucketCheckInterval")
    interval_field.setAccessible(True)
    self.assertEqual(1000, interval_field.get(buckets_builder))

    bucket_assigner_field = row_format_builder_clz.getDeclaredField("bucketAssigner")
    bucket_assigner_field.setAccessible(True)
    self.assertEqual("BasePathBucketAssigner",
                     bucket_assigner_field.get(buckets_builder).getClass().getSimpleName())

    rolling_policy_field = row_format_builder_clz.getDeclaredField("rollingPolicy")
    rolling_policy_field.setAccessible(True)
    self.assertEqual("OnCheckpointRollingPolicy",
                     rolling_policy_field.get(buckets_builder).getClass().getSimpleName())

    output_file_config_field = row_format_builder_clz.getDeclaredField("outputFileConfig")
    output_file_config_field.setAccessible(True)
    output_file_config = output_file_config_field.get(buckets_builder)
    self.assertEqual("pre", output_file_config.getPartPrefix())
    self.assertEqual("suf", output_file_config.getPartSuffix())
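# The getDeclaredField/setAccessible pattern above repeats for every field; it
# could be folded into a small helper. A minimal sketch (the helper name is
# hypothetical, not part of the test suite):
def _get_private_field(java_object, declaring_class, field_name):
    """Read a private field of a wrapped Java object via Py4J reflection."""
    field = declaring_class.getDeclaredField(field_name)
    field.setAccessible(True)
    return field.get(java_object)

# e.g.:
#   rolling_policy = _get_private_field(buckets_builder, row_format_builder_clz,
#                                       "rollingPolicy")
#   self.assertEqual("OnCheckpointRollingPolicy",
#                    rolling_policy.getClass().getSimpleName())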
data_stream = env.from_collection([
    ('hi', 4), ('hi', 5), ('hi', 8), ('hi', 9), ('hi', 15)],
    type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

# define the watermark strategy
watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
    .with_timestamp_assigner(MyTimestampAssigner())

ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
    .key_by(lambda x: x[0], key_type=Types.STRING()) \
    .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
    .process(CountWindowProcessFunction(),
             Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()]))

# define the sink
if output_path is not None:
    ds.sink_to(
        sink=FileSink.for_row_format(
            base_path=output_path,
            encoder=Encoder.simple_string_encoder())
        .with_output_file_config(
            OutputFileConfig.builder()
            .with_part_prefix("prefix")
            .with_part_suffix(".ext")
            .build())
        .with_rolling_policy(RollingPolicy.default_rolling_policy())
        .build())
else:
    print("Printing result to stdout. Use --output to specify output path.")
    ds.print()

# submit for execution
env.execute()
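# MyTimestampAssigner and CountWindowProcessFunction are referenced above but
# not defined in this fragment. A minimal sketch of what they could look like,
# assuming the second tuple field carries the event timestamp and the window
# function emits (key, window_start, window_end, element_count):
from typing import Iterable, Tuple

from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream.functions import ProcessWindowFunction
from pyflink.datastream.window import TimeWindow


class MyTimestampAssigner(TimestampAssigner):

    def extract_timestamp(self, value, record_timestamp) -> int:
        # use the second field of the ('hi', n) tuples as the event time
        return int(value[1])


class CountWindowProcessFunction(ProcessWindowFunction[Tuple, Tuple, str, TimeWindow]):

    def process(self,
                key: str,
                context: 'ProcessWindowFunction.Context',
                elements: Iterable[Tuple]) -> Iterable[Tuple]:
        # one result per window: key, window bounds, and the element count
        return [(key, context.window().start, context.window().end, len(list(elements)))]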