def test_kinesis_firehose_sink(self):
    _load_specific_flink_module_jars('/flink-connectors/'
                                     'flink-sql-connector-aws-kinesis-firehose')

    sink_properties = {
        'aws.region': 'eu-west-1',
        'aws.credentials.provider.basic.accesskeyid': 'aws_access_key_id',
        'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
    }

    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))

    kinesis_firehose_sink = KinesisFirehoseSink.builder() \
        .set_firehose_client_properties(sink_properties) \
        .set_serialization_schema(SimpleStringSchema()) \
        .set_delivery_stream_name('stream-1') \
        .set_fail_on_error(False) \
        .set_max_batch_size(500) \
        .set_max_in_flight_requests(50) \
        .set_max_buffered_requests(10000) \
        .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
        .set_max_time_in_buffer_ms(5000) \
        .set_max_record_size_in_bytes(1 * 1024 * 1024) \
        .build()

    ds.sink_to(kinesis_firehose_sink).name('kinesis firehose sink')

    plan = eval(self.env.get_execution_plan())

    self.assertEqual('kinesis firehose sink: Writer', plan['nodes'][1]['type'])
    self.assertEqual(
        get_field_value(kinesis_firehose_sink.get_java_function(), 'failOnError'), False)
    self.assertEqual(
        get_field_value(kinesis_firehose_sink.get_java_function(), 'deliveryStreamName'),
        'stream-1')
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to be one to make sure that all data including fired timer and
    # normal data are processed by the same worker and the collected result would be in order,
    # which is good for assertion.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    type_info = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(), Types.INT()])
    json_row_schema = JsonRowDeserializationSchema.builder().type_info(type_info).build()
    kafka_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-source'}
    kafka_consumer = FlinkKafkaConsumer("timer-stream-source", json_row_schema, kafka_props)
    kafka_producer = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(), kafka_props)

    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5)) \
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    kafka_consumer.set_start_from_earliest()
    ds = env.add_source(kafka_consumer).assign_timestamps_and_watermarks(watermark_strategy)
    ds.key_by(MyKeySelector(), key_type_info=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
def test_kinesis_streams_sink(self):
    sink_properties = {
        'aws.region': 'us-east-1',
        'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
    }

    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))

    kinesis_streams_sink = KinesisStreamsSink.builder() \
        .set_kinesis_client_properties(sink_properties) \
        .set_serialization_schema(SimpleStringSchema()) \
        .set_partition_key_generator(PartitionKeyGenerator.fixed()) \
        .set_stream_name("stream-1") \
        .set_fail_on_error(False) \
        .set_max_batch_size(500) \
        .set_max_in_flight_requests(50) \
        .set_max_buffered_requests(10000) \
        .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
        .set_max_time_in_buffer_ms(5000) \
        .set_max_record_size_in_bytes(1 * 1024 * 1024) \
        .build()

    ds.sink_to(kinesis_streams_sink).name('kinesis streams sink')

    plan = eval(self.env.get_execution_plan())

    self.assertEqual('kinesis streams sink: Writer', plan['nodes'][1]['type'])
    self.assertEqual(
        get_field_value(kinesis_streams_sink.get_java_function(), 'failOnError'), False)
    self.assertEqual(
        get_field_value(kinesis_streams_sink.get_java_function(), 'streamName'), 'stream-1')
def test_sink_set_topics_with_list(self):
    PulsarSink.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_topics(['ada', 'beta']) \
        .set_serialization_schema(
            PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
        .build()
def _build_source(initializer: KafkaOffsetsInitializer):
    return KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_topics('test_topic') \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .set_group_id('test_group') \
        .set_unbounded(initializer) \
        .build()
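# A minimal usage sketch for the helper above (an illustration, not part of the original
# tests): set_unbounded() takes an OffsetsInitializer describing where the streaming source
# should stop, so the helper can be driven with the KafkaOffsetsInitializer factory methods.
# The timestamp value is an arbitrary placeholder in epoch milliseconds.
source_stop_at_latest = _build_source(KafkaOffsetsInitializer.latest())
source_stop_at_committed = _build_source(KafkaOffsetsInitializer.committed_offsets())
source_stop_at_timestamp = _build_source(KafkaOffsetsInitializer.timestamp(1667232000000))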
def run():
    # Get the execution environment
    env = StreamExecutionEnvironment.get_execution_environment()
    # Configure the environment
    env_setting(env)
    # Set the parallelism
    env.set_parallelism(1)

    # Add the jar files; on Windows, change these to the paths of your own jars
    kafka_jar = f"file://{os.getcwd()}/jars/flink-connector-kafka_2.11-1.12.0.jar"
    kafka_client = f"file://{os.getcwd()}/jars/kafka-clients-2.4.1.jar"
    env.add_jars(kafka_jar, kafka_client)

    # Add Python dependency files
    env.add_python_file(f"{os.getcwd()}/config_file.py")
    env.add_python_file(f"{os.getcwd()}/env_setting.py")

    # Use a packaged Python environment (a custom virtual environment archive)
    env.add_python_archive(f"{os.getcwd()}/venv.zip")
    env.set_python_executable("venv.zip/venv/bin/python")
    # Or use the local Python environment
    # env.set_python_executable(PYTHON_EXECUTABLE)

    env.disable_operator_chaining()

    kafka_product_properties = get_kafka_Producer_properties(TEST_KAFKA_SERVERS)
    properties = get_kafka_customer_properties(TEST_KAFKA_SERVERS, TEST_GROUP_ID)

    data_stream = env.add_source(
        FlinkKafkaConsumer(topics=TEST_KAFKA_TOPIC,
                           properties=properties,
                           deserialization_schema=SimpleStringSchema())
        .set_commit_offsets_on_checkpoints(True)) \
        .name(f"consume data from topic {TEST_KAFKA_TOPIC}")

    data_stream.map(lambda value: json.loads(value)) \
        .name("parse to json") \
        .map(lambda value: json.dumps(value), BasicTypeInfo.STRING_TYPE_INFO()) \
        .name("convert to str") \
        .add_sink(FlinkKafkaProducer(topic=TEST_SINK_TOPIC,
                                     producer_config=kafka_product_properties,
                                     serialization_schema=SimpleStringSchema())) \
        .name("sink to kafka")

    env.execute("test pyflink kafka read and write")
def test_source_set_topics_pattern(self):
    PulsarSource.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_topics_pattern('ada.*') \
        .set_subscription_name('ff') \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_schema(SimpleStringSchema())) \
        .build()
def test_pulsar_source(self):
    TEST_OPTION_NAME = 'pulsar.source.enableAutoAcknowledgeMessage'
    pulsar_source = PulsarSource.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_topics('ada') \
        .set_start_cursor(StartCursor.earliest()) \
        .set_unbounded_stop_cursor(StopCursor.never()) \
        .set_bounded_stop_cursor(StopCursor.at_publish_time(22)) \
        .set_subscription_name('ff') \
        .set_subscription_type(SubscriptionType.Exclusive) \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_type_info(Types.STRING())) \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_schema(SimpleStringSchema())) \
        .set_config(TEST_OPTION_NAME, True) \
        .set_properties({'pulsar.source.autoCommitCursorInterval': '1000'}) \
        .build()

    ds = self.env.from_source(source=pulsar_source,
                              watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
                              source_name="pulsar source")
    ds.print()
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('Source: pulsar source', plan['nodes'][0]['type'])

    configuration = get_field_value(pulsar_source.get_java_function(), "sourceConfiguration")
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.client.serviceUrl')
            .string_type()
            .no_default_value()._j_config_option), 'pulsar://localhost:6650')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.admin.adminUrl')
            .string_type()
            .no_default_value()._j_config_option), 'http://localhost:8080')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.consumer.subscriptionName')
            .string_type()
            .no_default_value()._j_config_option), 'ff')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.consumer.subscriptionType')
            .string_type()
            .no_default_value()._j_config_option), SubscriptionType.Exclusive.name)
    test_option = ConfigOptions.key(TEST_OPTION_NAME).boolean_type().no_default_value()
    self.assertEqual(
        configuration.getBoolean(test_option._j_config_option), True)
    self.assertEqual(
        configuration.getLong(
            ConfigOptions.key('pulsar.source.autoCommitCursorInterval')
            .long_type()
            .no_default_value()._j_config_option), 1000)
def test_simple_string_schema(self):
    expected_string = 'test string'
    simple_string_schema = SimpleStringSchema()
    self.assertEqual(
        expected_string.encode(encoding='utf-8'),
        simple_string_schema._j_serialization_schema.serialize(expected_string))
    self.assertEqual(
        expected_string,
        simple_string_schema._j_deserialization_schema.deserialize(
            expected_string.encode(encoding='utf-8')))
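# A companion sketch (an assumption, not part of the original suite): SimpleStringSchema also
# accepts an explicit charset, so the same round-trip can be checked for a non-UTF-8 encoding.
# The non-ASCII character makes the charset actually matter.
def test_simple_string_schema_with_charset(self):
    expected_string = 'tèst string'
    simple_string_schema = SimpleStringSchema(charset='ISO-8859-1')
    self.assertEqual(
        expected_string.encode(encoding='ISO-8859-1'),
        simple_string_schema._j_serialization_schema.serialize(expected_string))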
def _check_key_serialization_schema(self, key_serialization_schema, expected_class):
    serialization_schema = KafkaRecordSerializationSchema.builder() \
        .set_topic('test-topic') \
        .set_key_serialization_schema(key_serialization_schema) \
        .set_value_serialization_schema(SimpleStringSchema()) \
        .build()
    schema_field = get_field_value(serialization_schema._j_serialization_schema,
                                   'keySerializationSchema')
    self.assertIsNotNone(schema_field)
    self.assertEqual(schema_field.getClass().getCanonicalName(), expected_class)
def test_compiling(self):
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_topics('test_topic') \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()

    ds = self.env.from_source(source=source,
                              watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
                              source_name='kafka source')
    ds.print()
    plan = json.loads(self.env.get_execution_plan())
    self.assertEqual('Source: kafka source', plan['nodes'][0]['type'])
def test_set_properties(self):
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_group_id('test_group_id') \
        .set_client_id_prefix('test_client_id_prefix') \
        .set_property('test_property', 'test_value') \
        .set_topics('test_topic') \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()
    conf = self._get_kafka_source_configuration(source)
    self.assertEqual(conf.get_string('bootstrap.servers', ''), 'localhost:9092')
    self.assertEqual(conf.get_string('group.id', ''), 'test_group_id')
    self.assertEqual(conf.get_string('client.id.prefix', ''), 'test_client_id_prefix')
    self.assertEqual(conf.get_string('test_property', ''), 'test_value')
def test_kinesis_source(self):
    consumer_config = {
        'aws.region': 'us-east-1',
        'aws.credentials.provider.basic.accesskeyid': 'aws_access_key_id',
        'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key',
        'flink.stream.initpos': 'LATEST'
    }

    kinesis_source = FlinkKinesisConsumer("stream-1", SimpleStringSchema(), consumer_config)

    ds = self.env.add_source(source_func=kinesis_source, source_name="kinesis source")
    ds.print()
    plan = eval(self.env.get_execution_plan())

    self.assertEqual('Source: kinesis source', plan['nodes'][0]['type'])
    self.assertEqual(
        get_field_value(kinesis_source.get_java_function(), 'streams')[0], 'stream-1')
def test_set_topic_pattern(self):
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_topic_pattern('test_topic*') \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()
    kafka_subscriber = get_field_value(source.get_java_function(), 'subscriber')
    self.assertEqual(
        kafka_subscriber.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.source.enumerator.subscriber.TopicPatternSubscriber')
    topic_pattern = get_field_value(kafka_subscriber, 'topicPattern')
    self.assertTrue(
        is_instance_of(topic_pattern, get_gateway().jvm.java.util.regex.Pattern))
    self.assertEqual(topic_pattern.toString(), 'test_topic*')
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to be one to make sure that all data including fired timer and
    # normal data are processed by the same worker and the collected result would be in order,
    # which is good for assertion.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    create_kafka_source_ddl = """
        CREATE TABLE payment_msg(
            createTime VARCHAR,
            rt as TO_TIMESTAMP(createTime),
            orderId BIGINT,
            payAmount DOUBLE,
            payPlatform INT,
            provinceId INT,
            WATERMARK FOR rt as rt - INTERVAL '2' SECOND
        ) WITH (
            'connector.type' = 'kafka',
            'connector.version' = 'universal',
            'connector.topic' = 'timer-stream-source',
            'connector.properties.bootstrap.servers' = 'localhost:9092',
            'connector.properties.group.id' = 'test_3',
            'connector.startup-mode' = 'earliest-offset',
            'format.type' = 'json'
        )
    """
    t_env.execute_sql(create_kafka_source_ddl)
    t = t_env.from_path("payment_msg").select("createTime, orderId, payAmount, payPlatform,"
                                              " provinceId")

    source_type_info = Types.ROW([
        Types.STRING(),
        Types.LONG(),
        Types.DOUBLE(),
        Types.INT(),
        Types.INT()])
    ds = t_env.to_append_stream(table=t, type_info=source_type_info)

    producer_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-source'}
    kafka_producer = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(),
                                        producer_props)
    ds.key_by(MyKeySelector(), key_type_info=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
def test_set_topics(self):
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_topics('test_topic1', 'test_topic2') \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()
    kafka_subscriber = get_field_value(source.get_java_function(), 'subscriber')
    self.assertEqual(
        kafka_subscriber.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.source.enumerator.subscriber.TopicListSubscriber')
    topics = get_field_value(kafka_subscriber, 'topics')
    self.assertTrue(is_instance_of(topics, get_gateway().jvm.java.util.List))
    self.assertEqual(topics.size(), 2)
    self.assertEqual(topics[0], 'test_topic1')
    self.assertEqual(topics[1], 'test_topic2')
def tutorial():
    env = StreamExecutionEnvironment.get_execution_environment()
    jar_files = (
        'flink-connector-kafka_2.12-1.12.2.jar',
        'kafka-clients-2.4.1.jar',
    )
    jar_paths = tuple(
        'file://' + os.path.abspath(os.path.join(cur_path, jar_file))
        for jar_file in jar_files)
    env.add_jars(*jar_paths)
    env.add_classpaths(*jar_paths)
    env.set_parallelism(1)
    ds = env.add_source(
        FlinkKafkaConsumer(TOPIC, SimpleStringSchema(), KAFKA_PROPERTIES))
    ds.print()
    env.execute("tutorial_job")
def test_set_value_only_deserializer(self):
    def _check(schema: DeserializationSchema, class_name: str):
        source = KafkaSource.builder() \
            .set_bootstrap_servers('localhost:9092') \
            .set_topics('test_topic') \
            .set_value_only_deserializer(schema) \
            .build()
        deserialization_schema_wrapper = get_field_value(source.get_java_function(),
                                                         'deserializationSchema')
        self.assertEqual(
            deserialization_schema_wrapper.getClass().getCanonicalName(),
            'org.apache.flink.connector.kafka.source.reader.deserializer'
            '.KafkaValueOnlyDeserializationSchemaWrapper')
        deserialization_schema = get_field_value(deserialization_schema_wrapper,
                                                 'deserializationSchema')
        self.assertEqual(deserialization_schema.getClass().getCanonicalName(), class_name)

    _check(SimpleStringSchema(),
           'org.apache.flink.api.common.serialization.SimpleStringSchema')
    _check(
        JsonRowDeserializationSchema.builder().type_info(Types.ROW([Types.STRING()])).build(),
        'org.apache.flink.formats.json.JsonRowDeserializationSchema')
    _check(
        CsvRowDeserializationSchema.Builder(Types.ROW([Types.STRING()])).build(),
        'org.apache.flink.formats.csv.CsvRowDeserializationSchema')
    avro_schema_string = """
    {
        "type": "record",
        "name": "test_record",
        "fields": []
    }
    """
    _check(
        AvroRowDeserializationSchema(avro_schema_string=avro_schema_string),
        'org.apache.flink.formats.avro.AvroRowDeserializationSchema')
def test_set_partitions(self):
    topic_partition_1 = KafkaTopicPartition('test_topic', 1)
    topic_partition_2 = KafkaTopicPartition('test_topic', 2)
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_partitions({topic_partition_1, topic_partition_2}) \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()
    kafka_subscriber = get_field_value(source.get_java_function(), 'subscriber')
    self.assertEqual(
        kafka_subscriber.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.source.enumerator.subscriber.PartitionSetSubscriber')
    partitions = get_field_value(kafka_subscriber, 'subscribedPartitions')
    self.assertTrue(is_instance_of(partitions, get_gateway().jvm.java.util.Set))
    self.assertTrue(topic_partition_1._to_j_topic_partition() in partitions)
    self.assertTrue(topic_partition_2._to_j_topic_partition() in partitions)
def _check_serialization_schema_implementations(check_function):
    input_type = Types.ROW([Types.STRING()])

    check_function(
        JsonRowSerializationSchema.builder().with_type_info(input_type).build(),
        'org.apache.flink.formats.json.JsonRowSerializationSchema')
    check_function(
        CsvRowSerializationSchema.Builder(input_type).build(),
        'org.apache.flink.formats.csv.CsvRowSerializationSchema')
    avro_schema_string = """
    {
        "type": "record",
        "name": "test_record",
        "fields": []
    }
    """
    check_function(
        AvroRowSerializationSchema(avro_schema_string=avro_schema_string),
        'org.apache.flink.formats.avro.AvroRowSerializationSchema')
    check_function(
        SimpleStringSchema(),
        'org.apache.flink.api.common.serialization.SimpleStringSchema')
def _build_serialization_schema() -> KafkaRecordSerializationSchema:
    return KafkaRecordSerializationSchema.builder() \
        .set_topic('test-topic') \
        .set_value_serialization_schema(SimpleStringSchema()) \
        .build()
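# A minimal sketch of how a record serializer like the one above plugs into a KafkaSink
# (an illustration, not part of the original tests; the broker address is a placeholder).
# The resulting sink would be attached to a stream with ds.sink_to(sink).
sink = KafkaSink.builder() \
    .set_bootstrap_servers('localhost:9092') \
    .set_record_serializer(_build_serialization_schema()) \
    .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
    .build()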
def test_pulsar_sink(self):
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))

    TEST_OPTION_NAME = 'pulsar.producer.chunkingEnabled'
    pulsar_sink = PulsarSink.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_producer_name('fo') \
        .set_topics('ada') \
        .set_serialization_schema(
            PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
        .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
        .set_topic_routing_mode(TopicRoutingMode.ROUND_ROBIN) \
        .delay_sending_message(MessageDelayer.fixed(Duration.of_seconds(12))) \
        .set_config(TEST_OPTION_NAME, True) \
        .set_properties({'pulsar.producer.batchingMaxMessages': '100'}) \
        .build()

    ds.sink_to(pulsar_sink).name('pulsar sink')

    plan = eval(self.env.get_execution_plan())
    self.assertEqual('pulsar sink: Writer', plan['nodes'][1]['type'])

    configuration = get_field_value(pulsar_sink.get_java_function(), "sinkConfiguration")
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.client.serviceUrl')
            .string_type()
            .no_default_value()._j_config_option), 'pulsar://localhost:6650')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.admin.adminUrl')
            .string_type()
            .no_default_value()._j_config_option), 'http://localhost:8080')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.producer.producerName')
            .string_type()
            .no_default_value()._j_config_option), 'fo - %s')

    j_pulsar_serialization_schema = get_field_value(
        pulsar_sink.get_java_function(), 'serializationSchema')
    j_serialization_schema = get_field_value(j_pulsar_serialization_schema,
                                             'serializationSchema')
    self.assertTrue(
        is_instance_of(j_serialization_schema,
                       'org.apache.flink.api.common.serialization.SimpleStringSchema'))

    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.sink.deliveryGuarantee')
            .string_type()
            .no_default_value()._j_config_option), 'at-least-once')

    j_topic_router = get_field_value(pulsar_sink.get_java_function(), "topicRouter")
    self.assertTrue(
        is_instance_of(
            j_topic_router,
            'org.apache.flink.connector.pulsar.sink.writer.router.RoundRobinTopicRouter'))

    j_message_delayer = get_field_value(pulsar_sink.get_java_function(), 'messageDelayer')
    delay_duration = get_field_value(j_message_delayer, 'delayDuration')
    self.assertEqual(delay_duration, 12000)

    test_option = ConfigOptions.key(TEST_OPTION_NAME).boolean_type().no_default_value()
    self.assertEqual(configuration.getBoolean(test_option._j_config_option), True)
    self.assertEqual(
        configuration.getLong(
            ConfigOptions.key('pulsar.producer.batchingMaxMessages')
            .long_type()
            .no_default_value()._j_config_option), 100)