def word_count(input_path, output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    if input_path is not None:
        ds = env.from_source(
            source=FileSource.for_record_stream_format(StreamFormat.text_line_format(),
                                                       input_path)
                             .process_static_file_set().build(),
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="file_source"
        )
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        ds = env.from_collection(word_count_data)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
           .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
           .key_by(lambda i: i[0]) \
           .reduce(lambda i, j: (i[0], i[1] + j[1]))

    # define the sink
    if output_path is not None:
        ds.sink_to(
            sink=FileSink.for_row_format(
                base_path=output_path,
                encoder=Encoder.simple_string_encoder())
            .with_output_file_config(
                OutputFileConfig.builder()
                .with_part_prefix("prefix")
                .with_part_suffix(".ext")
                .build())
            .with_rolling_policy(RollingPolicy.default_rolling_policy())
            .build()
        )
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        ds.print()

    # submit for execution
    env.execute()
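# A minimal sketch of the imports and sample input that word_count above assumes.
# The exact connector module path varies across Flink releases (older releases expose
# FileSource/FileSink under pyflink.datastream.connectors, newer ones under
# pyflink.datastream.connectors.file_system), and word_count_data is assumed to be a
# module-level list of sample input lines; the two lines below are only illustrative.
from pyflink.common import Types, WatermarkStrategy, Encoder
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
from pyflink.datastream.connectors.file_system import (FileSource, StreamFormat, FileSink,
                                                       OutputFileConfig, RollingPolicy)

word_count_data = ["To be, or not to be, that is the question",
                   "Whether 'tis nobler in the mind to suffer"]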
def demo01():
    env = StreamExecutionEnvironment.get_execution_environment()
    ds = env.from_collection(
        collection=[(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))

    # Assign watermarks to the events
    # 1. Built-in watermark generation strategies
    # 1.1 Bounded out-of-orderness watermarks: tolerate events arriving up to 10s late
    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(10))
    # 1.2 Monotonously increasing watermarks: the bounded strategy with the delay removed;
    #     the event timestamp itself serves as the watermark.
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps()
    # 2. Extracting the event time from the record
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps(
    ).with_timestamp_assigner(MyTimestampAssigner())
    """
    In some cases, so little data is produced that no records arrive for a while and
    therefore no watermarks are generated, which breaks downstream operations that depend
    on watermarks. For example, when an operator has several upstream inputs, its watermark
    is the minimum of the upstream watermarks; if one upstream input emits no watermark
    because it has no data, event time becomes skewed and downstream computations never
    trigger. Flink's WatermarkStrategy.with_idleness() therefore lets you mark a stream as
    idle when no records arrive within the configured timeout, so that downstream operators
    no longer wait for its watermarks. Once a new watermark is generated and emitted
    downstream, the stream becomes active again.
    """
    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(10)).with_idleness(Duration.of_seconds(30))

    # assign_timestamps_and_watermarks returns a new DataStream, so keep the result
    ds = ds.assign_timestamps_and_watermarks(watermark_strategy)
    ds.print()
    env.execute()
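# demo01 references MyTimestampAssigner without defining it. A minimal sketch,
# assuming the event timestamp (in milliseconds) is carried in the first field of each record:
from pyflink.common.watermark_strategy import TimestampAssigner


class MyTimestampAssigner(TimestampAssigner):

    def extract_timestamp(self, value, record_timestamp) -> int:
        # use the first field as the event timestamp
        return int(value[0])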
def state_access_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    seq_num_source = NumberSequenceSource(1, 10)

    output_path = '/opt/examples/datastream/output/state_access'
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(
            OutputFileConfig.builder()
            .with_part_prefix('pre')
            .with_part_suffix('suf')
            .build()) \
        .build()

    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='seq_num_source',
        type_info=Types.LONG())

    ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .map(MyMapFunction(), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .process(MyKeyedProcessFunction(), Types.LONG()) \
        .sink_to(file_sink)

    env.execute('11-data_stream_state_access')
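# MyMapFunction and MyKeyedProcessFunction are not defined in this snippet. Minimal
# sketches, assuming MyMapFunction passes records through while keeping a per-key count
# in ValueState, and MyKeyedProcessFunction emits a running sum of the second field per key:
from pyflink.common import Types
from pyflink.datastream.functions import MapFunction, KeyedProcessFunction, RuntimeContext
from pyflink.datastream.state import ValueStateDescriptor


class MyMapFunction(MapFunction):

    def open(self, runtime_context: RuntimeContext):
        state_desc = ValueStateDescriptor('cnt', Types.LONG())
        self.cnt_state = runtime_context.get_state(state_desc)

    def map(self, value):
        # keyed state: count how many records were seen for the current key
        cnt = self.cnt_state.value()
        self.cnt_state.update(1 if cnt is None else cnt + 1)
        return value


class MyKeyedProcessFunction(KeyedProcessFunction):

    def open(self, runtime_context: RuntimeContext):
        self.sum_state = runtime_context.get_state(
            ValueStateDescriptor('sum', Types.LONG()))

    def process_element(self, value, ctx):
        # accumulate the second field per key and emit the running sum
        current = self.sum_state.value() or 0
        current += value[1]
        self.sum_state.update(current)
        yield current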
def event_timer_timer_demo():
    env = StreamExecutionEnvironment.get_execution_environment()

    ds = env.from_collection(
        collection=[
            (1000, 'Alice', 110.1),
            (4000, 'Bob', 30.2),
            (3000, 'Alice', 20.0),
            (2000, 'Bob', 53.1),
            (5000, 'Alice', 13.1),
            (3000, 'Bob', 3.1),
            (7000, 'Bob', 16.1),
            (10000, 'Alice', 20.1)
        ],
        type_info=Types.TUPLE([Types.LONG(), Types.STRING(), Types.FLOAT()]))

    ds = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(2))
                         .with_timestamp_assigner(MyTimestampAssigner()))

    # apply the process function onto a keyed stream
    ds.key_by(lambda value: value[1]) \
        .process(Sum()) \
        .print()

    # submit for execution
    env.execute()
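# Sum is referenced above but not defined. A minimal sketch, assuming it keeps a running
# sum of the third field per key in ValueState and emits (key, sum) from an event-time timer:
from pyflink.common import Types
from pyflink.datastream.functions import KeyedProcessFunction, RuntimeContext
from pyflink.datastream.state import ValueStateDescriptor


class Sum(KeyedProcessFunction):

    def open(self, runtime_context: RuntimeContext):
        self.state = runtime_context.get_state(
            ValueStateDescriptor("sum", Types.FLOAT()))

    def process_element(self, value, ctx):
        # accumulate the amount and register a timer a bit after the current event time
        current = self.state.value() or 0.0
        current += value[2]
        self.state.update(current)
        ctx.timer_service().register_event_time_timer(ctx.timestamp() + 1500)

    def on_timer(self, timestamp, ctx):
        # emit (key, current sum) once the watermark passes the timer
        yield ctx.get_current_key(), self.state.value()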
def data_stream_word_count_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    input_path = '/opt/examples/datastream/input/word_count_input'
    output_path = '/opt/examples/datastream/output/data_stream_word_count'

    file_source = FileSource \
        .for_record_stream_format(StreamFormat.text_line_format(), input_path) \
        .process_static_file_set() \
        .build()
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(
            OutputFileConfig.builder()
            .with_part_prefix('pre')
            .with_part_suffix('suf')
            .build()) \
        .build()

    ds = env.from_source(
        source=file_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='file_source',
        type_info=Types.STRING())

    ds.map(lambda a: Row(a, 1), output_type=Types.ROW([Types.STRING(), Types.INT()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
        .sink_to(file_sink)

    env.execute('9-data_stream_word_count')
def batch_seq_num_test():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(2)
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    seq_num_source = NumberSequenceSource(1, 1000)

    output_path = '/opt/examples/output/batch_seq_num'
    file_sink = FileSink \
        .for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(
            OutputFileConfig.builder()
            .with_part_prefix('pre')
            .with_part_suffix('suf')
            .build()) \
        .build()

    ds = env.from_source(
        source=seq_num_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name='seq_num_source',
        type_info=Types.LONG())

    ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1])) \
        .sink_to(file_sink)

    env.execute('9-data_stream_batch_seq_num')
def test_pulsar_source(self):
    TEST_OPTION_NAME = 'pulsar.source.enableAutoAcknowledgeMessage'
    pulsar_source = PulsarSource.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_topics('ada') \
        .set_start_cursor(StartCursor.earliest()) \
        .set_unbounded_stop_cursor(StopCursor.never()) \
        .set_bounded_stop_cursor(StopCursor.at_publish_time(22)) \
        .set_subscription_name('ff') \
        .set_subscription_type(SubscriptionType.Exclusive) \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_type_info(Types.STRING())) \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_schema(SimpleStringSchema())) \
        .set_config(TEST_OPTION_NAME, True) \
        .set_properties({'pulsar.source.autoCommitCursorInterval': '1000'}) \
        .build()

    ds = self.env.from_source(source=pulsar_source,
                              watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
                              source_name="pulsar source")
    ds.print()
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('Source: pulsar source', plan['nodes'][0]['type'])

    configuration = get_field_value(pulsar_source.get_java_function(), "sourceConfiguration")
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.client.serviceUrl')
            .string_type()
            .no_default_value()._j_config_option), 'pulsar://localhost:6650')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.admin.adminUrl')
            .string_type()
            .no_default_value()._j_config_option), 'http://localhost:8080')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.consumer.subscriptionName')
            .string_type()
            .no_default_value()._j_config_option), 'ff')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.consumer.subscriptionType')
            .string_type()
            .no_default_value()._j_config_option), SubscriptionType.Exclusive.name)
    test_option = ConfigOptions.key(TEST_OPTION_NAME).boolean_type().no_default_value()
    self.assertEqual(
        configuration.getBoolean(test_option._j_config_option), True)
    self.assertEqual(
        configuration.getLong(
            ConfigOptions.key('pulsar.source.autoCommitCursorInterval')
            .long_type()
            .no_default_value()._j_config_option), 1000)
def _build_csv_job(self, schema, lines):
    with open(self.csv_file_name, 'w') as f:
        for line in lines:
            f.write(line)
    source = FileSource.for_record_stream_format(
        CsvReaderFormat.for_schema(schema), self.csv_file_name).build()
    ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'csv-source')
    ds.map(PassThroughMapFunction(), output_type=Types.PICKLED_BYTE_ARRAY()) \
        .add_sink(self.test_sink)
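# PassThroughMapFunction is not shown here. A minimal sketch, assuming it is an identity
# MapFunction that simply forwards each record to the test sink:
from pyflink.datastream.functions import MapFunction


class PassThroughMapFunction(MapFunction):

    def map(self, value):
        return value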
def test_from_and_to_data_stream_event_time(self):
    from pyflink.table import Schema

    ds = self.env.from_collection(
        [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
        Types.ROW_NAMED(["a", "b", "c"], [Types.LONG(), Types.INT(), Types.STRING()]))
    ds = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_monotonous_timestamps()
        .with_timestamp_assigner(MyTimestampAssigner()))

    table = self.t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
              .watermark("rowtime", "SOURCE_WATERMARK()")
              .build())
    self.assertEqual(
        """(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
        table._j_table.getResolvedSchema().toString())

    self.t_env.create_temporary_view(
        "t",
        ds,
        Schema.new_builder()
              .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
              .watermark("rowtime", "SOURCE_WATERMARK()")
              .build())
    result = self.t_env.execute_sql("SELECT "
                                    "c, SUM(b) "
                                    "FROM t "
                                    "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
    with result.collect() as result:
        collected_result = [str(item) for item in result]
        expected_result = [item for item in
                           map(str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])]
        expected_result.sort()
        collected_result.sort()
        self.assertEqual(expected_result, collected_result)

    ds = self.t_env.to_data_stream(table)
    ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
        .window(MyTumblingEventTimeWindow()) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)
    self.env.execute()
    expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
    actual_results = self.test_sink.get_results(False)
    expected_results.sort()
    actual_results.sort()
    self.assertEqual(expected_results, actual_results)
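# SumWindowFunction is not defined in this snippet. A minimal sketch, assuming it sums the
# second field of all elements assigned to a window and emits (key, sum);
# MyTumblingEventTimeWindow is assumed to be a custom tumbling event-time WindowAssigner
# defined elsewhere in the test module.
from typing import Iterable

from pyflink.datastream.functions import WindowFunction
from pyflink.datastream.window import TimeWindow


class SumWindowFunction(WindowFunction):

    def apply(self, key: str, window: TimeWindow, inputs: Iterable[tuple]):
        # sum the integer field across the window contents
        result = 0
        for i in inputs:
            result += i[1]
        return [(key, result)]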
def _build_csv_job(self, schema: CsvSchema, lines):
    with open(self.csv_file_name, 'w') as f:
        for line in lines:
            f.write(line)
    source = FileSource.for_record_stream_format(
        CsvReaderFormat.for_schema(schema), self.csv_file_name).build()
    ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'csv-source')
    sink = FileSink.for_bulk_format(
        self.csv_dir_name, CsvBulkWriters.for_schema(schema)).build()
    ds.sink_to(sink)
def test_from_and_to_changelog_stream_event_time(self):
    from pyflink.table import Schema

    self.env.set_parallelism(1)
    ds = self.env.from_collection(
        [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
        Types.ROW([Types.LONG(), Types.INT(), Types.STRING()]))
    ds = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_monotonous_timestamps()
        .with_timestamp_assigner(MyTimestampAssigner()))

    changelog_stream = ds.map(lambda t: Row(t.f1, t.f2),
                              Types.ROW([Types.INT(), Types.STRING()]))

    # derive physical columns and add a rowtime
    table = self.t_env.from_changelog_stream(
        changelog_stream,
        Schema.new_builder()
              .column_by_metadata("rowtime", DataTypes.TIMESTAMP_LTZ(3))
              .column_by_expression("computed", str(col("f1").upper_case))
              .watermark("rowtime", str(source_watermark()))
              .build())
    self.t_env.create_temporary_view("t", table)

    # access and reorder columns
    reordered = self.t_env.sql_query("SELECT computed, rowtime, f0 FROM t")

    # write out the rowtime column with fully declared schema
    result = self.t_env.to_changelog_stream(
        reordered,
        Schema.new_builder()
              .column("f1", DataTypes.STRING())
              .column_by_metadata("rowtime", DataTypes.TIMESTAMP_LTZ(3))
              .column_by_expression("ignored", str(col("f1").upper_case))
              .column("f0", DataTypes.INT())
              .build())

    # test event time window and field access
    result.key_by(lambda k: k.f1) \
        .window(MyTumblingEventTimeWindow()) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)
    self.env.execute()
    expected_results = ['(A,47)', '(C,1000)', '(C,1000)']
    actual_results = self.test_sink.get_results(False)
    expected_results.sort()
    actual_results.sort()
    self.assertEqual(expected_results, actual_results)
output_path = known_args.output

env = StreamExecutionEnvironment.get_execution_environment()
# write all the data to one file
env.set_parallelism(1)

# define the source
data_stream = env.from_collection([('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4),
                                   ('hi', 5), ('hi', 8), ('hi', 9), ('hi', 15)],
                                  type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

# define the watermark strategy
watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
    .with_timestamp_assigner(MyTimestampAssigner())

ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
    .key_by(lambda x: x[0], key_type=Types.STRING()) \
    .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
    .process(CountWindowProcessFunction(),
             Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()]))

# define the sink
if output_path is not None:
    ds.sink_to(
        sink=FileSink.for_row_format(
            base_path=output_path,
            encoder=Encoder.simple_string_encoder())
        .with_output_file_config(
            OutputFileConfig.builder()
            .with_part_prefix("prefix")
            .with_part_suffix(".ext")
            .build())
        .with_rolling_policy(RollingPolicy.default_rolling_policy())
        .build())
else:
    print("Printing result to stdout. Use --output to specify output path.")
    ds.print()

# submit for execution
env.execute()
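# CountWindowProcessFunction is referenced above but not defined in this snippet. A minimal
# sketch, assuming it emits (key, window_start, window_end, element_count) for each window:
from typing import Iterable

from pyflink.datastream.functions import ProcessWindowFunction


class CountWindowProcessFunction(ProcessWindowFunction):

    def process(self, key, context, elements: Iterable[tuple]):
        # emit the key, the window bounds and the number of elements in the window
        return [(key, context.window().start, context.window().end,
                 len([e for e in elements]))]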
pulsar_source = PulsarSource.builder() \
    .set_service_url(SERVICE_URL) \
    .set_admin_url(ADMIN_URL) \
    .set_topics('ada') \
    .set_start_cursor(StartCursor.latest()) \
    .set_unbounded_stop_cursor(StopCursor.never()) \
    .set_subscription_name('pyflink_subscription') \
    .set_subscription_type(SubscriptionType.Exclusive) \
    .set_deserialization_schema(
        PulsarDeserializationSchema.flink_schema(SimpleStringSchema())) \
    .set_config('pulsar.source.enableAutoAcknowledgeMessage', True) \
    .set_properties({'pulsar.source.autoCommitCursorInterval': '1000'}) \
    .build()

ds = env.from_source(
    source=pulsar_source,
    watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
    source_name="pulsar source")

pulsar_sink = PulsarSink.builder() \
    .set_service_url(SERVICE_URL) \
    .set_admin_url(ADMIN_URL) \
    .set_producer_name('pyflink_producer') \
    .set_topics('beta') \
    .set_serialization_schema(
        PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
    .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
    .set_topic_routing_mode(TopicRoutingMode.ROUND_ROBIN) \
    .set_config('pulsar.producer.maxPendingMessages', 1000) \
    .set_properties({'pulsar.producer.batchingMaxMessages': '100'}) \
    .build()