def test_stream_file_sink(self):
    self.env.set_parallelism(2)
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))
    ds.map(lambda a: a[0], Types.STRING()).add_sink(
        StreamingFileSink.for_row_format(self.tempdir, Encoder.simple_string_encoder())
        .with_rolling_policy(
            RollingPolicy.default_rolling_policy(
                part_size=1024 * 1024 * 1024,
                rollover_interval=15 * 60 * 1000,
                inactivity_interval=5 * 60 * 1000))
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix")
            .build())
        .build())

    self.env.execute("test_streaming_file_sink")

    results = []
    import os
    for root, dirs, files in os.walk(self.tempdir, topdown=True):
        for file in files:
            self.assertTrue(file.startswith('.prefix'))
            self.assertTrue('suffix' in file)
            path = root + "/" + file
            with open(path) as infile:
                for line in infile:
                    results.append(line)

    expected = ['deeefg\n', 'bdc\n', 'ab\n', 'cfgs\n']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
def test_jdbc_sink(self):
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))
    jdbc_connection_options = JdbcConnectionOptions.JdbcConnectionOptionsBuilder() \
        .with_driver_name('com.mysql.jdbc.Driver') \
        .with_user_name('root') \
        .with_password('password') \
        .with_url('jdbc:mysql://server-name:server-port/database-name') \
        .build()
    jdbc_execution_options = JdbcExecutionOptions.builder() \
        .with_batch_interval_ms(2000) \
        .with_batch_size(100) \
        .with_max_retries(5) \
        .build()
    jdbc_sink = JdbcSink.sink("insert into test table", ds.get_type(),
                              jdbc_connection_options, jdbc_execution_options)

    ds.add_sink(jdbc_sink).name('jdbc sink')
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])

    j_output_format = get_field_value(jdbc_sink.get_java_function(), 'outputFormat')

    connection_options = JdbcConnectionOptions(
        get_field_value(get_field_value(j_output_format, 'connectionProvider'), 'jdbcOptions'))
    self.assertEqual(jdbc_connection_options.get_db_url(), connection_options.get_db_url())
    self.assertEqual(jdbc_connection_options.get_driver_name(),
                     connection_options.get_driver_name())
    self.assertEqual(jdbc_connection_options.get_password(), connection_options.get_password())
    self.assertEqual(jdbc_connection_options.get_user_name(), connection_options.get_user_name())

    exec_options = JdbcExecutionOptions(get_field_value(j_output_format, 'executionOptions'))
    self.assertEqual(jdbc_execution_options.get_batch_interval_ms(),
                     exec_options.get_batch_interval_ms())
    self.assertEqual(jdbc_execution_options.get_batch_size(), exec_options.get_batch_size())
    self.assertEqual(jdbc_execution_options.get_max_retries(), exec_options.get_max_retries())
def test_from_java_type(self):
    basic_int_type_info = Types.INT()
    self.assertEqual(basic_int_type_info,
                     _from_java_type(basic_int_type_info.get_java_type_info()))

    basic_short_type_info = Types.SHORT()
    self.assertEqual(basic_short_type_info,
                     _from_java_type(basic_short_type_info.get_java_type_info()))

    basic_long_type_info = Types.LONG()
    self.assertEqual(basic_long_type_info,
                     _from_java_type(basic_long_type_info.get_java_type_info()))

    basic_float_type_info = Types.FLOAT()
    self.assertEqual(basic_float_type_info,
                     _from_java_type(basic_float_type_info.get_java_type_info()))

    basic_double_type_info = Types.DOUBLE()
    self.assertEqual(basic_double_type_info,
                     _from_java_type(basic_double_type_info.get_java_type_info()))

    basic_char_type_info = Types.CHAR()
    self.assertEqual(basic_char_type_info,
                     _from_java_type(basic_char_type_info.get_java_type_info()))

    basic_byte_type_info = Types.BYTE()
    self.assertEqual(basic_byte_type_info,
                     _from_java_type(basic_byte_type_info.get_java_type_info()))

    basic_big_int_type_info = Types.BIG_INT()
    self.assertEqual(basic_big_int_type_info,
                     _from_java_type(basic_big_int_type_info.get_java_type_info()))

    basic_big_dec_type_info = Types.BIG_DEC()
    self.assertEqual(basic_big_dec_type_info,
                     _from_java_type(basic_big_dec_type_info.get_java_type_info()))

    basic_sql_date_type_info = Types.SQL_DATE()
    self.assertEqual(basic_sql_date_type_info,
                     _from_java_type(basic_sql_date_type_info.get_java_type_info()))

    basic_sql_time_type_info = Types.SQL_TIME()
    self.assertEqual(basic_sql_time_type_info,
                     _from_java_type(basic_sql_time_type_info.get_java_type_info()))

    basic_sql_timestamp_type_info = Types.SQL_TIMESTAMP()
    self.assertEqual(basic_sql_timestamp_type_info,
                     _from_java_type(basic_sql_timestamp_type_info.get_java_type_info()))

    row_type_info = Types.ROW([Types.INT(), Types.STRING()])
    self.assertEqual(row_type_info, _from_java_type(row_type_info.get_java_type_info()))

    tuple_type_info = Types.TUPLE([Types.CHAR(), Types.INT()])
    self.assertEqual(tuple_type_info, _from_java_type(tuple_type_info.get_java_type_info()))

    primitive_int_array_type_info = Types.PRIMITIVE_ARRAY(Types.INT())
    self.assertEqual(primitive_int_array_type_info,
                     _from_java_type(primitive_int_array_type_info.get_java_type_info()))

    object_array_type_info = Types.OBJECT_ARRAY(Types.SQL_DATE())
    self.assertEqual(object_array_type_info,
                     _from_java_type(object_array_type_info.get_java_type_info()))

    pickled_byte_array_type_info = Types.PICKLED_BYTE_ARRAY()
    self.assertEqual(pickled_byte_array_type_info,
                     _from_java_type(pickled_byte_array_type_info.get_java_type_info()))

    sql_date_type_info = Types.SQL_DATE()
    self.assertEqual(sql_date_type_info,
                     _from_java_type(sql_date_type_info.get_java_type_info()))

    map_type_info = Types.MAP(Types.INT(), Types.STRING())
    self.assertEqual(map_type_info, _from_java_type(map_type_info.get_java_type_info()))

    list_type_info = Types.LIST(Types.INT())
    self.assertEqual(list_type_info, _from_java_type(list_type_info.get_java_type_info()))
def test_generate_stream_graph_with_dependencies(self):
    python_file_dir = os.path.join(self.tempdir, "python_file_dir_" + str(uuid.uuid4()))
    os.mkdir(python_file_dir)
    python_file_path = os.path.join(python_file_dir, "test_stream_dependency_manage_lib.py")
    with open(python_file_path, 'w') as f:
        f.write("def add_two(a):\n    return a + 2")
    env = self.env
    env.add_python_file(python_file_path)

    def plus_two_map(value):
        from test_stream_dependency_manage_lib import add_two
        return value[0], add_two(value[1])

    def add_from_file(i):
        with open("data/data.txt", 'r') as f:
            return i[0], i[1] + int(f.read())

    from_collection_source = env.from_collection(
        [('a', 0), ('b', 0), ('c', 1), ('d', 1), ('e', 2)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))
    from_collection_source.name("From Collection")
    keyed_stream = from_collection_source.key_by(lambda x: x[1], key_type=Types.INT())

    plus_two_map_stream = keyed_stream.map(plus_two_map).name("Plus Two Map").set_parallelism(3)

    add_from_file_map = plus_two_map_stream.map(add_from_file).name("Add From File Map")

    test_stream_sink = add_from_file_map.add_sink(self.test_sink).name("Test Sink")
    test_stream_sink.set_parallelism(4)

    archive_dir_path = os.path.join(self.tempdir, "archive_" + str(uuid.uuid4()))
    os.mkdir(archive_dir_path)
    with open(os.path.join(archive_dir_path, "data.txt"), 'w') as f:
        f.write("3")
    archive_file_path = \
        shutil.make_archive(os.path.dirname(archive_dir_path), 'zip', archive_dir_path)
    env.add_python_archive(archive_file_path, "data")

    nodes = eval(env.get_execution_plan())['nodes']

    # The StreamGraph should be as below:
    # Source: From Collection -> _stream_key_by_map_operator ->
    # Plus Two Map -> Add From File Map -> Sink: Test Sink.

    # Source: From Collection and _stream_key_by_map_operator should have the same parallelism.
    self.assertEqual(nodes[0]['parallelism'], nodes[1]['parallelism'])

    # The parallelism of Plus Two Map should be 3.
    self.assertEqual(nodes[2]['parallelism'], 3)

    # The ship_strategy between Source: From Collection and _stream_key_by_map_operator
    # should be FORWARD.
    self.assertEqual(nodes[1]['predecessors'][0]['ship_strategy'], "FORWARD")

    # The ship_strategy between _stream_key_by_map_operator and Plus Two Map should be HASH.
    self.assertEqual(nodes[2]['predecessors'][0]['ship_strategy'], "HASH")

    # The parallelism of Sink: Test Sink should be 4.
    self.assertEqual(nodes[4]['parallelism'], 4)

    python_dependency_config = dict(
        get_gateway().jvm.org.apache.flink.python.util.PythonDependencyUtils
        .configurePythonDependencies(
            env._j_stream_execution_environment.getCachedFiles(),
            env._j_stream_execution_environment.getConfiguration()).toMap())

    # Make sure that the user-specified files and archives are correctly added.
    self.assertIsNotNone(python_dependency_config['python.internal.files-key-map'])
    self.assertIsNotNone(python_dependency_config['python.internal.archives-key-map'])
def test_from_collection_with_data_types(self):
    # Verify from_collection for a collection of single objects.
    ds = self.env.from_collection(['Hi', 'Hello'], type_info=Types.STRING())
    ds.add_sink(self.test_sink)
    self.env.execute("test from collection with single object")
    results = self.test_sink.get_results(False)
    expected = ['Hello', 'Hi']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)

    # Verify from_collection for a collection of composite objects such as tuples.
    ds = self.env.from_collection(
        [(1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
          bytearray(b'flink'), 'pyflink', datetime.date(2014, 9, 13),
          datetime.time(hour=12, minute=0, second=0, microsecond=123000),
          datetime.datetime(2018, 3, 11, 3, 0, 0, 123000), [1, 2, 3],
          decimal.Decimal('1000000000000000000.05'),
          decimal.Decimal('1000000000000000000.0599999999999'
                          '9999899999999999')),
         (2, None, 2, True, 43878, 9147483648, 9.87, 2.98936,
          bytearray(b'flink'), 'pyflink', datetime.date(2015, 10, 14),
          datetime.time(hour=11, minute=2, second=2, microsecond=234500),
          datetime.datetime(2020, 4, 15, 8, 2, 6, 235000), [2, 4, 6],
          decimal.Decimal('2000000000000000000.74'),
          decimal.Decimal('2000000000000000000.061111111111111'
                          '11111111111111'))],
        type_info=Types.ROW([
            Types.LONG(), Types.LONG(), Types.SHORT(), Types.BOOLEAN(), Types.SHORT(),
            Types.INT(), Types.FLOAT(), Types.DOUBLE(), Types.PICKLED_BYTE_ARRAY(),
            Types.STRING(), Types.SQL_DATE(), Types.SQL_TIME(), Types.SQL_TIMESTAMP(),
            Types.BASIC_ARRAY(Types.LONG()), Types.BIG_DEC(), Types.BIG_DEC()]))
    ds.add_sink(self.test_sink)
    self.env.execute("test from collection with tuple object")
    results = self.test_sink.get_results(False)
    # If the user specifies the data types of the input data, the collected results should be
    # in Row format.
    expected = [
        '+I[1, null, 1, true, 32767, -2147483648, 1.23, 1.98932, [102, 108, 105, 110, 107], '
        'pyflink, 2014-09-13, 12:00:00, 2018-03-11 03:00:00.123, [1, 2, 3], '
        '1000000000000000000.05, 1000000000000000000.05999999999999999899999999999]',
        '+I[2, null, 2, true, -21658, 557549056, 9.87, 2.98936, [102, 108, 105, 110, 107], '
        'pyflink, 2015-10-14, 11:02:02, 2020-04-15 08:02:06.235, [2, 4, 6], '
        '2000000000000000000.74, 2000000000000000000.06111111111111111111111111111]']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
def partition_custom(self,
                     partitioner: Union[Callable, Partitioner],
                     key_selector: Union[Callable, KeySelector]) -> 'DataStream':
    """
    Partitions a DataStream on the key returned by the selector, using a custom partitioner.
    This method takes the key selector to get the key to partition on, and a partitioner that
    accepts the key type.

    Note that this method works only on single field keys, i.e. the selector cannot return
    tuples of fields.

    :param partitioner: The partitioner to assign partitions to keys.
    :param key_selector: The KeySelector with which the DataStream is partitioned.
    :return: The partitioned DataStream.
    """
    if callable(key_selector):
        key_selector = KeySelectorFunctionWrapper(key_selector)
    if not isinstance(key_selector, (KeySelector, KeySelectorFunctionWrapper)):
        raise TypeError("Parameter key_selector should be a type of KeySelector.")

    if callable(partitioner):
        partitioner = PartitionerFunctionWrapper(partitioner)
    if not isinstance(partitioner, (Partitioner, PartitionerFunctionWrapper)):
        raise TypeError("Parameter partitioner should be a type of Partitioner.")

    gateway = get_gateway()
    data_stream_num_partitions_env_key = gateway.jvm \
        .org.apache.flink.datastream.runtime.operators.python \
        .DataStreamPythonPartitionCustomFunctionOperator.DATA_STREAM_NUM_PARTITIONS

    class PartitionCustomMapFunction(MapFunction):
        """
        A wrapper class for the partition_custom map function. It indicates that this is a
        partition-custom operation, so DataStreamPythonPartitionCustomFunctionOperator needs
        to be applied to run the map function.
        """

        def __init__(self):
            self.num_partitions = None

        def map(self, value):
            return self.partition_custom_map(value)

        def partition_custom_map(self, value):
            if self.num_partitions is None:
                self.num_partitions = int(os.environ[data_stream_num_partitions_env_key])
            partition = partitioner.partition(key_selector.get_key(value), self.num_partitions)
            return partition, value

        def __repr__(self) -> str:
            return '_Flink_PartitionCustomMapFunction'

    original_type_info = self.get_type()
    intermediate_map_stream = self.map(PartitionCustomMapFunction(),
                                       type_info=Types.ROW([Types.INT(), original_type_info]))
    intermediate_map_stream.name(
        gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
        .STREAM_PARTITION_CUSTOM_MAP_OPERATOR_NAME)

    JPartitionCustomKeySelector = gateway.jvm \
        .org.apache.flink.datastream.runtime.functions.python.PartitionCustomKeySelector
    JIdPartitioner = gateway.jvm.org.apache.flink.api.java.functions.IdPartitioner
    intermediate_map_stream = DataStream(
        intermediate_map_stream._j_data_stream.partitionCustom(JIdPartitioner(),
                                                               JPartitionCustomKeySelector()))

    values_map_stream = intermediate_map_stream.map(lambda x: x[1], original_type_info)
    values_map_stream.name(
        gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
        .KEYED_STREAM_VALUE_OPERATOR_NAME)
    return DataStream(values_map_stream._j_data_stream)
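# A minimal usage sketch of the method above (illustrative, not part of the original source).
# It assumes an existing StreamExecutionEnvironment `env` and that Types is imported as in the
# surrounding snippets; the stream contents and the lambda partitioner / key selector are
# hypothetical examples. Both lambdas are wrapped into Partitioner / KeySelector internally.
def partition_custom_example(env):
    ds = env.from_collection(
        [('a', 0), ('b', 1), ('c', 2), ('d', 3)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))
    # Send each record to the partition given by its key modulo the number of partitions;
    # the key selector returns the single integer field of the record as the key.
    return ds.partition_custom(
        lambda key, num_partitions: key % num_partitions,
        lambda value: value[1])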
def kafka_connector_assertion(self, flink_kafka_consumer_clz, flink_kafka_producer_clz):
    source_topic = 'test_source_topic'
    sink_topic = 'test_sink_topic'
    props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    type_info = Types.ROW([Types.INT(), Types.STRING()])

    # Test for kafka consumer
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    flink_kafka_consumer = flink_kafka_consumer_clz(source_topic, deserialization_schema, props)
    flink_kafka_consumer.set_start_from_earliest()
    flink_kafka_consumer.set_commit_offsets_on_checkpoints(True)

    j_properties = get_field_value(flink_kafka_consumer.get_java_function(), 'properties')
    self.assertEqual('localhost:9092', j_properties.getProperty('bootstrap.servers'))
    self.assertEqual('test_group', j_properties.getProperty('group.id'))
    self.assertTrue(get_field_value(flink_kafka_consumer.get_java_function(),
                                    'enableCommitOnCheckpoints'))
    j_start_up_mode = get_field_value(flink_kafka_consumer.get_java_function(), 'startupMode')

    j_deserializer = get_field_value(flink_kafka_consumer.get_java_function(), 'deserializer')
    j_deserialize_type_info = invoke_java_object_method(j_deserializer, "getProducedType")
    deserialize_type_info = typeinfo._from_java_type(j_deserialize_type_info)
    self.assertTrue(deserialize_type_info == type_info)
    self.assertTrue(j_start_up_mode.equals(
        get_gateway().jvm.org.apache.flink.streaming.connectors.kafka
        .config.StartupMode.EARLIEST))
    j_topic_desc = get_field_value(flink_kafka_consumer.get_java_function(), 'topicsDescriptor')
    j_topics = invoke_java_object_method(j_topic_desc, 'getFixedTopics')
    self.assertEqual(['test_source_topic'], list(j_topics))

    # Test for kafka producer
    serialization_schema = JsonRowSerializationSchema.builder() \
        .with_type_info(type_info).build()
    flink_kafka_producer = flink_kafka_producer_clz(sink_topic, serialization_schema, props)
    flink_kafka_producer.set_write_timestamp_to_kafka(False)

    j_producer_config = get_field_value(flink_kafka_producer.get_java_function(),
                                        'producerConfig')
    self.assertEqual('localhost:9092', j_producer_config.getProperty('bootstrap.servers'))
    self.assertEqual('test_group', j_producer_config.getProperty('group.id'))
    self.assertFalse(get_field_value(flink_kafka_producer.get_java_function(),
                                     'writeTimestampToKafka'))
def test_pulsar_sink(self):
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))

    TEST_OPTION_NAME = 'pulsar.producer.chunkingEnabled'
    pulsar_sink = PulsarSink.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_producer_name('fo') \
        .set_topics('ada') \
        .set_serialization_schema(
            PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
        .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
        .set_topic_routing_mode(TopicRoutingMode.ROUND_ROBIN) \
        .delay_sending_message(MessageDelayer.fixed(Duration.of_seconds(12))) \
        .set_config(TEST_OPTION_NAME, True) \
        .set_properties({'pulsar.producer.batchingMaxMessages': '100'}) \
        .build()

    ds.sink_to(pulsar_sink).name('pulsar sink')

    plan = eval(self.env.get_execution_plan())
    self.assertEqual('pulsar sink: Writer', plan['nodes'][1]['type'])
    configuration = get_field_value(pulsar_sink.get_java_function(), "sinkConfiguration")
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.client.serviceUrl')
            .string_type().no_default_value()._j_config_option),
        'pulsar://localhost:6650')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.admin.adminUrl')
            .string_type().no_default_value()._j_config_option),
        'http://localhost:8080')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.producer.producerName')
            .string_type().no_default_value()._j_config_option),
        'fo - %s')

    j_pulsar_serialization_schema = get_field_value(
        pulsar_sink.get_java_function(), 'serializationSchema')
    j_serialization_schema = get_field_value(
        j_pulsar_serialization_schema, 'serializationSchema')
    self.assertTrue(
        is_instance_of(
            j_serialization_schema,
            'org.apache.flink.api.common.serialization.SimpleStringSchema'))

    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.sink.deliveryGuarantee')
            .string_type().no_default_value()._j_config_option),
        'at-least-once')

    j_topic_router = get_field_value(pulsar_sink.get_java_function(), "topicRouter")
    self.assertTrue(
        is_instance_of(
            j_topic_router,
            'org.apache.flink.connector.pulsar.sink.writer.router.RoundRobinTopicRouter'))

    j_message_delayer = get_field_value(pulsar_sink.get_java_function(), 'messageDelayer')
    delay_duration = get_field_value(j_message_delayer, 'delayDuration')
    self.assertEqual(delay_duration, 12000)

    test_option = ConfigOptions.key(TEST_OPTION_NAME).boolean_type().no_default_value()
    self.assertEqual(
        configuration.getBoolean(test_option._j_config_option), True)
    self.assertEqual(
        configuration.getLong(
            ConfigOptions.key('pulsar.producer.batchingMaxMessages')
            .long_type().no_default_value()._j_config_option),
        100)
def demo01():
    # Create an execution environment that represents the context in which the program is
    # currently executed. If the program is invoked standalone, this returns a local
    # execution environment.
    # 1: Create a stream execution environment: a local environment when started locally,
    #    a cluster environment when started on a cluster.
    env = StreamExecutionEnvironment.get_execution_environment()

    # Add a list of URLs that are added to the classpath of every user-code class loader of
    # the program. The paths must specify a protocol (e.g. file://) and be accessible on all
    # nodes.
    env.add_classpaths("file://lib")

    # Add a list of jar files that will be uploaded to the cluster and referenced by the job;
    # equivalent to the configuration .set_string("pipeline.jars", 'file://' + dir_kafka_sql_connect).
    env.add_jars("file://jars")

    # Add a Python archive file. The file will be extracted into the working directory of the
    # Python UDF worker. Currently only zip format is supported, e.g. zip, jar, whl, egg, etc.
    # Create the archive first, e.g.: zip -r py_env.zip py_env
    env.add_python_archive("py_env.zip")
    # If the Python UDFs depend on a specific Python version that does not exist on the
    # cluster, this method can be used to upload a virtual environment. Note that the path of
    # the Python interpreter contained in the uploaded environment should be specified via
    # this method.
    env.set_python_executable("py_env.zip/py_env/bin/python")
    # In conf/flink-conf.yaml, add: python.client.executable: /usr/bin/python3

    # or
    env.add_python_archive("py_env.zip", "myenv")
    env.set_python_executable("myenv/py_env/bin/python")
    # The files contained in the archive file can be accessed in a UDF:
    """
    def my_udf():
        with open("myenv/py_env/data/data.txt") as f:
            ...
    """

    # Equivalent to: pip download -d cached_dir -r requirements.txt --no-binary :all:
    env.set_python_requirements("requirements.txt", "cached_dir")

    # Add a Python dependency, which can be a Python file, a Python package or a local
    # directory. It will be added to the PYTHONPATH of the Python UDF worker. Make sure these
    # dependencies can be imported.
    env.add_python_file("")

    # Add a source
    # 1. add_source
    ds = env.add_source(
        FlinkKafkaConsumer(
            "source_topic",
            JsonRowDeserializationSchema.builder().type_info(
                type_info=Types.ROW([Types.INT(), Types.STRING()])).build(),
            {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}))
    # 2. from_collection
    ds = env.from_collection([1, 2, 3], Types.INT())
    # 3. From a file
    ds = env.read_text_file("hdfs://host:port/file/path")

    # Disable operator chaining
    env.disable_operator_chaining()

    """
    Flink can run stateful stream computations very efficiently: the state of each operator is
    kept with Flink's built-in Keyed State and Operator State.
    By default, state lives in the JVM heap. If some part of the system fails or a machine goes
    down, all of that state is lost and cannot be recovered, which corrupts the result of the
    whole computation.
    Checkpoints provide the fault tolerance here: a checkpoint periodically persists the state
    of the operators. When recovering from a failure, each operator's state is restored from
    the checkpoint and consumption restarts from the last consumed position, so the computation
    stays efficient while guaranteeing that no data is lost and results are computed only once.

    AT_LEAST_ONCE: if the failure is assumed to be in transit and the server never received the
    data, the data is resent after a timeout. But the failure may instead have happened while
    returning the acknowledgement, in which case the server already received the data and the
    retry delivers it again; that is "at least once".
    EXACTLY_ONCE: strictly once.
    The delivery guarantees are At-most-once, At-least-once and Exactly-once.

    Two prerequisites for checkpoints:
    1. A data source that can replay data within a certain time range, e.g. Kafka.
       Fault tolerance means that after a task fails it is automatically restored from the most
       recent successful checkpoint, and the data consumed before the failure must be consumed
       again. If the source cannot replay, data that was not yet written to storage is lost and
       can never be re-consumed after recovery.
    2.
       A durable store to hold the persisted state, e.g. HDFS or local files, so that the
       checkpoint data can be restored from it after a task failure.

    https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/stream/state/checkpointing.html
    https://ci.apache.org/projects/flink/flink-docs-release-1.12/api/python/pyflink.datastream.html#pyflink.datastream.CheckpointConfig
    """
    # Take a checkpoint every 300s
    env.enable_checkpointing(300000, CheckpointingMode.AT_LEAST_ONCE)
    # Other backends: MemoryStateBackend, FsStateBackend, CustomStateBackend
    env.set_state_backend(RocksDBStateBackend("file://var/checkpoints/"))
    # Set the mode to exactly-once (this is the default)
    env.get_checkpoint_config().set_checkpointing_mode(CheckpointingMode.EXACTLY_ONCE)
    # Make sure at least 500 ms pass between two checkpoints; the default is 0, i.e. the next
    # checkpoint may start immediately.
    env.get_checkpoint_config().set_min_pause_between_checkpoints(500)
    # Checkpoints have to complete within one minute, or they are discarded.
    env.get_checkpoint_config().set_checkpoint_timeout(60000)
    # Allow only one checkpoint to be in progress at the same time.
    env.get_checkpoint_config().set_max_concurrent_checkpoints(1)
    # Enable externalized checkpoints, which are retained after the job is cancelled.
    env.get_checkpoint_config().enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    # Allow job recovery to fall back to a checkpoint even when a more recent savepoint exists.
    env.get_checkpoint_config().set_prefer_checkpoint_for_recovery(True)
    # Enable the experimental unaligned checkpoints to improve performance;
    # only available with CheckpointingMode.EXACTLY_ONCE.
    env.get_checkpoint_config().enable_unaligned_checkpoints()
    # env.get_checkpoint_config().disable_unaligned_checkpoints() is equivalent to
    # env.get_checkpoint_config().enable_unaligned_checkpoints(False)
    env.get_checkpoint_interval()  # same as env.get_checkpoint_config().get_checkpoint_interval()

    # https://ci.apache.org/projects/flink/flink-docs-release-1.12/api/python/pyflink.common.html#pyflink.common.ExecutionConfig
    # bin/flink run -Dexecution.runtime-mode=BATCH examples/streaming/WordCount.jar
    env.get_config().set_execution_mode(ExecutionMode.BATCH)
    env.get_config().disable_auto_generated_uids()  # or enable_auto_generated_uids
    # Set your own uid instead: ds.uid("xx")

    # Set the time characteristic for all streams created from this environment, e.g.
    # processing time, event time or ingestion time.
    # If the characteristic is set to EventTime or IngestionTime, the default watermark update
    # interval is set to 200 ms.
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)  # set the timestamp assigner
    env.get_config().set_auto_watermark_interval(200)  # emit a watermark every 200 ms

    env.get_config().set_global_job_parameters({"environment.checkpoint_interval": "1000"})
    env.get_config().set_restart_strategy(RestartStrategies.fixed_delay_restart(10, 1000))

    # Execute
    env.execute("job name")
    # Execute asynchronously
    jobClient = env.execute_async("job name")
    jobClient.get_job_execution_result().result()

    """
    Set the maximum time frequency (milliseconds) for flushing the output buffers. By default
    the output buffers flush frequently to provide low latency and a smooth developer
    experience. Setting this parameter yields three logical modes:
    - a positive integer triggers a periodic flush at that interval
    - 0 triggers a flush after every record, minimizing latency (better not to use 0; a value
      close to 0 such as 5 or 10 works well)
    - -1 triggers a flush only when the output buffer is full, maximizing throughput
    """
    # Maximum interval (ms) at which output buffers are flushed
    env.get_buffer_timeout()
    env.set_buffer_timeout(10)

    # Get the execution plan as JSON and paste it into https://flink.apache.org/visualizer/
    env.get_execution_plan()
def ds_operators():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_python_executable(r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))
    """
    map
    flat_map
    filter
    key_by    DataStream -> KeyedStream
    reduce    KeyedStream -> DataStream
    union     DataStream* -> DataStream
    connect   DataStream, DataStream -> ConnectedStreams
    Tuple transformation: project
    Partitioning:
        partition_custom  custom partitioning
        shuffle           random partitioning: distributes elements uniformly at random
        rebalance         round-robin partitioning
        rescale           rescaling repartition
        broadcast         broadcasts elements to every partition
    Arbitrary customization:
        process  Keyed state and the timer service (TimerService, comparable to windows in the
                 Java API) are only accessible when a ProcessFunction is applied on a
                 KeyedStream.
    Others: start_new_chain, disable_chaining, slot_sharing_group
    (A short union / connect sketch follows this function.)
    """
    ds.rescale()
    # Placeholder functions for illustration; supply real logic as needed.
    ds.map(lambda x: x)
    ds.flat_map(lambda x: [x])
    ds.filter(lambda x: True)

    # key_by: DataStream -> KeyedStream
    # reduce: KeyedStream -> DataStream
    ds = s_env.from_collection([(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
                               type_info=Types.ROW([Types.INT(), Types.STRING()]))
    ds.key_by(lambda a: a[1]) \
        .reduce(lambda a, b: Row(a[0] + b[0], b[1]))

    # Broadcast
    ds.broadcast()

    # project is only available on tuple DataStreams
    ds = s_env.from_collection(
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        type_info=Types.TUPLE([Types.INT(), Types.INT(), Types.INT(), Types.INT()]))
    # Emit the fields at tuple indexes 1 and 3
    ds.project(1, 3).map(lambda x: (x[0], x[1] + 1)).print()  # print() used as a stand-in sink

    # Sink: write to files
    ds.add_sink(
        StreamingFileSink.for_row_format('/tmp/output', SimpleStringEncoder())
        .with_rolling_policy(
            DefaultRollingPolicy.builder()
            .with_rollover_interval(15 * 60 * 1000)
            .with_inactivity_interval(5 * 60 * 1000)
            .with_max_part_size(1024 * 1024 * 1024)
            .build())
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix")
            .build())
        .build())
    s_env.execute('ds_operators')
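# A minimal sketch (not part of the original notes) of union and connect, which the operator
# list above mentions but the demo does not exercise. The stream contents, the MyCoMap class
# and the job name are illustrative assumptions.
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.functions import CoMapFunction


def union_connect_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    ints_a = env.from_collection([1, 2, 3], Types.INT())
    ints_b = env.from_collection([4, 5, 6], Types.INT())
    words = env.from_collection(['a', 'b'], Types.STRING())

    # union: DataStream* -> DataStream; all inputs must have the same type.
    ints_a.union(ints_b).print()

    # connect: DataStream, DataStream -> ConnectedStreams; the two inputs may have different
    # types and are processed by map1/map2 of a CoMapFunction.
    class MyCoMap(CoMapFunction):

        def map1(self, value):
            return 'int: ' + str(value)

        def map2(self, value):
            return 'str: ' + value

    ints_a.connect(words).map(MyCoMap()).print()

    env.execute('union_connect_example')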
def test_keyed_process_function_with_state(self):
    self.env.set_parallelism(1)
    self.env.get_config().set_auto_watermark_interval(2000)
    self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    data_stream = self.env.from_collection(
        [(1, 'hi', '1603708211000'), (2, 'hello', '1603708224000'),
         (3, 'hi', '1603708226000'), (4, 'hello', '1603708289000'),
         (5, 'hi', '1603708291000'), (6, 'hello', '1603708293000')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))

    class MyTimestampAssigner(TimestampAssigner):

        def extract_timestamp(self, value, record_timestamp) -> int:
            return int(value[2])

    class MyProcessFunction(KeyedProcessFunction):

        def __init__(self):
            self.value_state = None
            self.list_state = None
            self.map_state = None

        def open(self, runtime_context: RuntimeContext):
            value_state_descriptor = ValueStateDescriptor('value_state', Types.INT())
            self.value_state = runtime_context.get_state(value_state_descriptor)
            list_state_descriptor = ListStateDescriptor('list_state', Types.INT())
            self.list_state = runtime_context.get_list_state(list_state_descriptor)
            map_state_descriptor = MapStateDescriptor('map_state', Types.INT(), Types.STRING())
            self.map_state = runtime_context.get_map_state(map_state_descriptor)

        def process_element(self, value, ctx):
            current_value = self.value_state.value()
            self.value_state.update(value[0])
            current_list = [_ for _ in self.list_state.get()]
            self.list_state.add(value[0])
            map_entries_string = []
            for k, v in self.map_state.items():
                map_entries_string.append(str(k) + ': ' + str(v))
            map_entries_string = '{' + ', '.join(map_entries_string) + '}'
            self.map_state.put(value[0], value[1])
            current_key = ctx.get_current_key()
            yield "current key: {}, current value state: {}, current list state: {}, " \
                  "current map state: {}, current value: {}".format(str(current_key),
                                                                    str(current_value),
                                                                    str(current_list),
                                                                    map_entries_string,
                                                                    str(value))

        def on_timer(self, timestamp, ctx):
            pass

    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(MyTimestampAssigner())
    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[1], key_type_info=Types.STRING()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(self.test_sink)
    self.env.execute('test time stamp assigner with keyed process function')
    result = self.test_sink.get_results()
    expected_result = [
        "current key: hi, current value state: None, current list state: [], "
        "current map state: {}, current value: Row(f0=1, f1='hi', "
        "f2='1603708211000')",
        "current key: hello, current value state: None, "
        "current list state: [], current map state: {}, current value: Row(f0=2,"
        " f1='hello', f2='1603708224000')",
        "current key: hi, current value state: 1, current list state: [1], "
        "current map state: {1: hi}, current value: Row(f0=3, f1='hi', "
        "f2='1603708226000')",
        "current key: hello, current value state: 2, current list state: [2], "
        "current map state: {2: hello}, current value: Row(f0=4, f1='hello', "
        "f2='1603708289000')",
        "current key: hi, current value state: 3, current list state: [1, 3], "
        "current map state: {1: hi, 3: hi}, current value: Row(f0=5, f1='hi', "
        "f2='1603708291000')",
        "current key: hello, current value state: 4, current list state: [2, 4],"
        " current map state: {2: hello, 4: hello}, current value: Row(f0=6, "
        "f1='hello', f2='1603708293000')"]
    result.sort()
    expected_result.sort()
    self.assertEqual(expected_result, result)