def _check_specified_offsets_initializer(
        self,
        source: KafkaSource,
        offsets: Dict[KafkaTopicPartition, int],
        reset_strategy: KafkaOffsetResetStrategy,
        is_start: bool = True):
    if is_start:
        field_name = 'startingOffsetsInitializer'
    else:
        field_name = 'stoppingOffsetsInitializer'
    offsets_initializer = get_field_value(source.get_java_function(), field_name)
    self.assertEqual(
        offsets_initializer.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.source.enumerator.initializer'
        '.SpecifiedOffsetsInitializer')

    initial_offsets = get_field_value(offsets_initializer, 'initialOffsets')
    self.assertTrue(is_instance_of(initial_offsets, get_gateway().jvm.java.util.Map))
    self.assertEqual(initial_offsets.size(), len(offsets))
    for j_topic_partition in initial_offsets:
        topic_partition = KafkaTopicPartition(j_topic_partition.topic(),
                                              j_topic_partition.partition())
        self.assertIsNotNone(offsets.get(topic_partition))
        self.assertEqual(initial_offsets[j_topic_partition], offsets[topic_partition])

    offset_reset_strategy = get_field_value(offsets_initializer, 'offsetResetStrategy')
    self.assertTrue(
        offset_reset_strategy.equals(reset_strategy._to_j_offset_reset_strategy()))
def _check_reader_handled_offsets_initializer(
        self,
        source: KafkaSource,
        offset: int,
        reset_strategy: KafkaOffsetResetStrategy,
        is_start: bool = True):
    if is_start:
        field_name = 'startingOffsetsInitializer'
    else:
        field_name = 'stoppingOffsetsInitializer'
    offsets_initializer = get_field_value(source.get_java_function(), field_name)
    self.assertEqual(
        offsets_initializer.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.source.enumerator.initializer'
        '.ReaderHandledOffsetsInitializer')

    starting_offset = get_field_value(offsets_initializer, 'startingOffset')
    self.assertEqual(starting_offset, offset)

    offset_reset_strategy = get_field_value(offsets_initializer, 'offsetResetStrategy')
    self.assertTrue(
        offset_reset_strategy.equals(reset_strategy._to_j_offset_reset_strategy()))
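# Hypothetical usage sketch (not part of the original tests): assuming PyFlink's
# KafkaOffsetsInitializer.offsets(...) and KafkaSourceBuilder.set_starting_offsets(...)
# mirror the Java connector API, a test could exercise the checker above like this.
def test_specified_starting_offsets_sketch(self):
    offsets = {
        KafkaTopicPartition('test_topic', 0): 100,
        KafkaTopicPartition('test_topic', 1): 200,
    }
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_topics('test_topic') \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .set_starting_offsets(KafkaOffsetsInitializer.offsets(offsets)) \
        .build()
    # EARLIEST is assumed to be the default reset strategy for specified offsets.
    self._check_specified_offsets_initializer(
        source, offsets, KafkaOffsetResetStrategy.EARLIEST)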
def test_rabbitmq_connectors(self):
    connection_config = RMQConnectionConfig.Builder() \
        .set_host('localhost') \
        .set_port(5672) \
        .set_virtual_host('/') \
        .set_user_name('guest') \
        .set_password('guest') \
        .build()
    type_info = Types.ROW([Types.INT(), Types.STRING()])
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    rmq_source = RMQSource(connection_config, 'source_queue', True, deserialization_schema)
    self.assertEqual(
        get_field_value(rmq_source.get_java_function(), 'queueName'), 'source_queue')
    self.assertTrue(get_field_value(rmq_source.get_java_function(), 'usesCorrelationId'))

    serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
        .build()
    rmq_sink = RMQSink(connection_config, 'sink_queue', serialization_schema)
    self.assertEqual(
        get_field_value(rmq_sink.get_java_function(), 'queueName'), 'sink_queue')
def test_set_delivery_guarantee(self):
    sink = KafkaSink.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_record_serializer(self._build_serialization_schema()) \
        .build()
    guarantee = get_field_value(sink.get_java_function(), 'deliveryGuarantee')
    self.assertEqual(guarantee.toString(), 'none')

    sink = KafkaSink.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
        .set_record_serializer(self._build_serialization_schema()) \
        .build()
    guarantee = get_field_value(sink.get_java_function(), 'deliveryGuarantee')
    self.assertEqual(guarantee.toString(), 'at-least-once')

    sink = KafkaSink.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_delivery_guarantee(DeliveryGuarantee.EXACTLY_ONCE) \
        .set_record_serializer(self._build_serialization_schema()) \
        .build()
    guarantee = get_field_value(sink.get_java_function(), 'deliveryGuarantee')
    self.assertEqual(guarantee.toString(), 'exactly-once')
def test_kinesis_firehose_sink(self):
    _load_specific_flink_module_jars('/flink-connectors/'
                                     'flink-sql-connector-aws-kinesis-firehose')

    sink_properties = {
        'aws.region': 'eu-west-1',
        'aws.credentials.provider.basic.accesskeyid': 'aws_access_key_id',
        'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
    }

    ds = self.env.from_collection(
        [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))

    kinesis_firehose_sink = KinesisFirehoseSink.builder() \
        .set_firehose_client_properties(sink_properties) \
        .set_serialization_schema(SimpleStringSchema()) \
        .set_delivery_stream_name('stream-1') \
        .set_fail_on_error(False) \
        .set_max_batch_size(500) \
        .set_max_in_flight_requests(50) \
        .set_max_buffered_requests(10000) \
        .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
        .set_max_time_in_buffer_ms(5000) \
        .set_max_record_size_in_bytes(1 * 1024 * 1024) \
        .build()

    ds.sink_to(kinesis_firehose_sink).name('kinesis firehose sink')
    plan = eval(self.env.get_execution_plan())

    self.assertEqual('kinesis firehose sink: Writer', plan['nodes'][1]['type'])
    self.assertEqual(
        get_field_value(kinesis_firehose_sink.get_java_function(), 'failOnError'), False)
    self.assertEqual(
        get_field_value(kinesis_firehose_sink.get_java_function(), 'deliveryStreamName'),
        'stream-1')
def test_kinesis_streams_sink(self):
    sink_properties = {
        'aws.region': 'us-east-1',
        'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
    }

    ds = self.env.from_collection(
        [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))

    kinesis_streams_sink = KinesisStreamsSink.builder() \
        .set_kinesis_client_properties(sink_properties) \
        .set_serialization_schema(SimpleStringSchema()) \
        .set_partition_key_generator(PartitionKeyGenerator.fixed()) \
        .set_stream_name("stream-1") \
        .set_fail_on_error(False) \
        .set_max_batch_size(500) \
        .set_max_in_flight_requests(50) \
        .set_max_buffered_requests(10000) \
        .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
        .set_max_time_in_buffer_ms(5000) \
        .set_max_record_size_in_bytes(1 * 1024 * 1024) \
        .build()

    ds.sink_to(kinesis_streams_sink).name('kinesis streams sink')
    plan = eval(self.env.get_execution_plan())

    self.assertEqual('kinesis streams sink: Writer', plan['nodes'][1]['type'])
    self.assertEqual(
        get_field_value(kinesis_streams_sink.get_java_function(), 'failOnError'), False)
    self.assertEqual(
        get_field_value(kinesis_streams_sink.get_java_function(), 'streamName'), 'stream-1')
def test_es_sink_dynamic(self):
    ds = self.env.from_collection(
        [{'name': 'ada', 'id': '1'}, {'name': 'luna', 'id': '2'}],
        type_info=Types.MAP(Types.STRING(), Types.STRING()))

    es_dynamic_index_sink = Elasticsearch7SinkBuilder() \
        .set_emitter(ElasticsearchEmitter.dynamic_index('name', 'id')) \
        .set_hosts(['localhost:9200']) \
        .build()

    j_emitter = get_field_value(es_dynamic_index_sink.get_java_function(), 'emitter')
    self.assertTrue(
        is_instance_of(
            j_emitter,
            'org.apache.flink.connector.elasticsearch.sink.SimpleElasticsearchEmitter'))

    ds.sink_to(es_dynamic_index_sink).name('es dynamic index sink')
def test_set_topic_pattern(self):
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_topic_pattern('test_topic*') \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()
    kafka_subscriber = get_field_value(source.get_java_function(), 'subscriber')
    self.assertEqual(
        kafka_subscriber.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.source.enumerator.subscriber.TopicPatternSubscriber')
    topic_pattern = get_field_value(kafka_subscriber, 'topicPattern')
    self.assertTrue(is_instance_of(topic_pattern, get_gateway().jvm.java.util.regex.Pattern))
    self.assertEqual(topic_pattern.toString(), 'test_topic*')
def _check(schema: DeserializationSchema, class_name: str):
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_topics('test_topic') \
        .set_value_only_deserializer(schema) \
        .build()
    deserialization_schema_wrapper = get_field_value(source.get_java_function(),
                                                     'deserializationSchema')
    self.assertEqual(
        deserialization_schema_wrapper.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.source.reader.deserializer'
        '.KafkaValueOnlyDeserializationSchemaWrapper')
    deserialization_schema = get_field_value(deserialization_schema_wrapper,
                                             'deserializationSchema')
    self.assertEqual(deserialization_schema.getClass().getCanonicalName(), class_name)
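# A minimal sketch (not from the original tests) of how the nested _check helper above
# might be invoked; the expected canonical class name for SimpleStringSchema is the one
# also asserted in the record-serializer test below.
_check(SimpleStringSchema(),
       'org.apache.flink.api.common.serialization.SimpleStringSchema')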
def test_jdbc_sink(self):
    ds = self.env.from_collection(
        [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))
    jdbc_connection_options = JdbcConnectionOptions.JdbcConnectionOptionsBuilder() \
        .with_driver_name('com.mysql.jdbc.Driver') \
        .with_user_name('root') \
        .with_password('password') \
        .with_url('jdbc:mysql://server-name:server-port/database-name') \
        .build()
    jdbc_execution_options = JdbcExecutionOptions.builder() \
        .with_batch_interval_ms(2000) \
        .with_batch_size(100) \
        .with_max_retries(5) \
        .build()
    jdbc_sink = JdbcSink.sink("insert into test table", ds.get_type(),
                              jdbc_connection_options, jdbc_execution_options)

    ds.add_sink(jdbc_sink).name('jdbc sink')
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])

    j_output_format = get_field_value(jdbc_sink.get_java_function(), 'outputFormat')

    connection_options = JdbcConnectionOptions(
        get_field_value(get_field_value(j_output_format, 'connectionProvider'),
                        'jdbcOptions'))
    self.assertEqual(jdbc_connection_options.get_db_url(), connection_options.get_db_url())
    self.assertEqual(jdbc_connection_options.get_driver_name(),
                     connection_options.get_driver_name())
    self.assertEqual(jdbc_connection_options.get_password(),
                     connection_options.get_password())
    self.assertEqual(jdbc_connection_options.get_user_name(),
                     connection_options.get_user_name())

    exec_options = JdbcExecutionOptions(get_field_value(j_output_format, 'executionOptions'))
    self.assertEqual(jdbc_execution_options.get_batch_interval_ms(),
                     exec_options.get_batch_interval_ms())
    self.assertEqual(jdbc_execution_options.get_batch_size(),
                     exec_options.get_batch_size())
    self.assertEqual(jdbc_execution_options.get_max_retries(),
                     exec_options.get_max_retries())
def _check_timestamp_offsets_initializer(self,
                                         source: KafkaSource,
                                         timestamp: int,
                                         is_start: bool = True):
    if is_start:
        field_name = 'startingOffsetsInitializer'
    else:
        field_name = 'stoppingOffsetsInitializer'
    offsets_initializer = get_field_value(source.get_java_function(), field_name)
    self.assertEqual(
        offsets_initializer.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.source.enumerator.initializer'
        '.TimestampOffsetsInitializer')

    starting_timestamp = get_field_value(offsets_initializer, 'startingTimestamp')
    self.assertEqual(starting_timestamp, timestamp)
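# Hypothetical usage sketch (not part of the original tests): assuming
# KafkaOffsetsInitializer.timestamp(...) mirrors the Java API, a test could build a source
# with timestamp-based starting offsets and verify it with the checker above.
def test_timestamp_starting_offsets_sketch(self):
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_topics('test_topic') \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .set_starting_offsets(KafkaOffsetsInitializer.timestamp(1646707617000)) \
        .build()
    self._check_timestamp_offsets_initializer(source, 1646707617000)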
def test_set_transactional_id_prefix(self):
    sink = KafkaSink.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_transactional_id_prefix('test-prefix') \
        .set_record_serializer(self._build_serialization_schema()) \
        .build()
    prefix = get_field_value(sink.get_java_function(), 'transactionalIdPrefix')
    self.assertEqual(prefix, 'test-prefix')
def test_set_property(self):
    sink = KafkaSink.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_record_serializer(self._build_serialization_schema()) \
        .set_property('test-key', 'test-value') \
        .build()
    config = get_field_value(sink.get_java_function(), 'kafkaProducerConfig')
    self.assertEqual(config.get('test-key'), 'test-value')
def test_pulsar_source(self):
    TEST_OPTION_NAME = 'pulsar.source.enableAutoAcknowledgeMessage'
    pulsar_source = PulsarSource.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_topics('ada') \
        .set_start_cursor(StartCursor.earliest()) \
        .set_unbounded_stop_cursor(StopCursor.never()) \
        .set_bounded_stop_cursor(StopCursor.at_publish_time(22)) \
        .set_subscription_name('ff') \
        .set_subscription_type(SubscriptionType.Exclusive) \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_type_info(Types.STRING())) \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_schema(SimpleStringSchema())) \
        .set_config(TEST_OPTION_NAME, True) \
        .set_properties({'pulsar.source.autoCommitCursorInterval': '1000'}) \
        .build()

    ds = self.env.from_source(source=pulsar_source,
                              watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
                              source_name="pulsar source")
    ds.print()
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('Source: pulsar source', plan['nodes'][0]['type'])

    configuration = get_field_value(pulsar_source.get_java_function(), "sourceConfiguration")
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.client.serviceUrl')
            .string_type()
            .no_default_value()._j_config_option), 'pulsar://localhost:6650')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.admin.adminUrl')
            .string_type()
            .no_default_value()._j_config_option), 'http://localhost:8080')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.consumer.subscriptionName')
            .string_type()
            .no_default_value()._j_config_option), 'ff')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.consumer.subscriptionType')
            .string_type()
            .no_default_value()._j_config_option), SubscriptionType.Exclusive.name)
    test_option = ConfigOptions.key(TEST_OPTION_NAME).boolean_type().no_default_value()
    self.assertEqual(
        configuration.getBoolean(test_option._j_config_option), True)
    self.assertEqual(
        configuration.getLong(
            ConfigOptions.key('pulsar.source.autoCommitCursorInterval')
            .long_type()
            .no_default_value()._j_config_option), 1000)
def test_set_topics(self):
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_topics('test_topic1', 'test_topic2') \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()
    kafka_subscriber = get_field_value(source.get_java_function(), 'subscriber')
    self.assertEqual(
        kafka_subscriber.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.source.enumerator.subscriber.TopicListSubscriber')
    topics = get_field_value(kafka_subscriber, 'topics')
    self.assertTrue(is_instance_of(topics, get_gateway().jvm.java.util.List))
    self.assertEqual(topics.size(), 2)
    self.assertEqual(topics[0], 'test_topic1')
    self.assertEqual(topics[1], 'test_topic2')
def test_set_record_serializer(self):
    sink = KafkaSink.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_record_serializer(self._build_serialization_schema()) \
        .build()
    serializer = get_field_value(sink.get_java_function(), 'recordSerializer')
    self.assertEqual(
        serializer.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.sink.'
        'KafkaRecordSerializationSchemaBuilder.'
        'KafkaRecordSerializationSchemaWrapper')
    topic_selector = get_field_value(serializer, 'topicSelector')
    self.assertEqual(topic_selector.apply(None), 'test-topic')
    value_serializer = get_field_value(serializer, 'valueSerializationSchema')
    self.assertEqual(
        value_serializer.getClass().getCanonicalName(),
        'org.apache.flink.api.common.serialization.SimpleStringSchema')
def _generate_stream_graph(self, clear_transformations: bool = False, job_name: str = None) \
        -> JavaObject:
    gateway = get_gateway()
    JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
    # start BeamFnLoopbackWorkerPoolServicer when executed in MiniCluster
    j_configuration = get_j_env_configuration(self._j_stream_execution_environment)

    def startup_loopback_server():
        jvm = gateway.jvm
        env_config = JPythonConfigUtil.getEnvironmentConfig(
            self._j_stream_execution_environment)
        parallelism = self.get_parallelism()
        if parallelism > 1 and env_config.containsKey(jvm.PythonOptions.PYTHON_ARCHIVES.key()):
            import logging
            logging.warning("Loopback mode is disabled as python archives are used and the "
                            "parallelism of the job is greater than 1. The Python user-defined "
                            "functions will be executed in an independent Python process.")
        else:
            from pyflink.fn_execution.beam.beam_worker_pool_service import \
                BeamFnLoopbackWorkerPoolServicer
            j_env = jvm.System.getenv()
            get_field_value(j_env, "m").put(
                'PYFLINK_LOOPBACK_SERVER_ADDRESS',
                BeamFnLoopbackWorkerPoolServicer().start())

    python_worker_execution_mode = None
    if hasattr(self, "_python_worker_execution_mode"):
        python_worker_execution_mode = getattr(self, "_python_worker_execution_mode")

    if python_worker_execution_mode is None:
        if is_local_deployment(j_configuration):
            startup_loopback_server()
    elif python_worker_execution_mode == 'loopback':
        if is_local_deployment(j_configuration):
            startup_loopback_server()
        else:
            raise ValueError("Loopback mode is enabled, however the job wasn't configured to "
                             "run in local deployment mode")
    elif python_worker_execution_mode != 'process':
        raise ValueError(
            "It only supports to execute the Python worker in 'loopback' mode and 'process' "
            "mode, unknown mode '%s' is configured" % python_worker_execution_mode)

    JPythonConfigUtil.configPythonOperator(self._j_stream_execution_environment)

    gateway.jvm.org.apache.flink.python.chain.PythonOperatorChainingOptimizer.apply(
        self._j_stream_execution_environment)

    JPythonConfigUtil.setPartitionCustomOperatorNumPartitions(
        get_field_value(self._j_stream_execution_environment, "transformations"))

    j_stream_graph = self._j_stream_execution_environment.getStreamGraph(clear_transformations)
    if job_name is not None:
        j_stream_graph.setJobName(job_name)
    return j_stream_graph
def test_with_idleness(self):
    jvm = get_gateway().jvm

    j_watermark_strategy = WatermarkStrategy.no_watermarks().with_idleness(
        Duration.of_seconds(5))._j_watermark_strategy
    self.assertTrue(
        is_instance_of(
            j_watermark_strategy,
            jvm.org.apache.flink.api.common.eventtime.WatermarkStrategyWithIdleness))
    self.assertEqual(
        get_field_value(j_watermark_strategy, "idlenessTimeout").toMillis(), 5000)
def _check_value_serialization_schema(value_serialization_schema, expected_class):
    serialization_schema = KafkaRecordSerializationSchema.builder() \
        .set_topic('test-topic') \
        .set_value_serialization_schema(value_serialization_schema) \
        .build()
    schema_field = get_field_value(serialization_schema._j_serialization_schema,
                                   'valueSerializationSchema')
    self.assertIsNotNone(schema_field)
    self.assertEqual(schema_field.getClass().getCanonicalName(), expected_class)
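# A minimal sketch (not from the original tests) of calling the helper above with a concrete
# schema; the expected canonical class name is the one asserted for SimpleStringSchema
# elsewhere in this file.
_check_value_serialization_schema(
    SimpleStringSchema(),
    'org.apache.flink.api.common.serialization.SimpleStringSchema')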
def _generate_stream_graph(self, clear_transformations: bool = False, job_name: str = None) \
        -> JavaObject:
    gateway = get_gateway()
    JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
    # start BeamFnLoopbackWorkerPoolServicer when executed in MiniCluster
    j_configuration = get_j_env_configuration(self._j_stream_execution_environment)

    def startup_loopback_server():
        from pyflink.fn_execution.beam.beam_worker_pool_service import \
            BeamFnLoopbackWorkerPoolServicer
        jvm = gateway.jvm
        j_env = jvm.System.getenv()
        get_field_value(j_env, "m").put(
            'PYFLINK_LOOPBACK_SERVER_ADDRESS',
            BeamFnLoopbackWorkerPoolServicer().start())

    python_worker_execution_mode = None
    if hasattr(self, "_python_worker_execution_mode"):
        python_worker_execution_mode = getattr(self, "_python_worker_execution_mode")

    if python_worker_execution_mode is None:
        if is_local_deployment(j_configuration):
            startup_loopback_server()
    elif python_worker_execution_mode == 'loopback':
        if is_local_deployment(j_configuration):
            startup_loopback_server()
        else:
            raise ValueError("Loopback mode is enabled, however the job wasn't configured to "
                             "run in local deployment mode")
    elif python_worker_execution_mode != 'process':
        raise ValueError(
            "It only supports to execute the Python worker in 'loopback' mode and 'process' "
            "mode, unknown mode '%s' is configured" % python_worker_execution_mode)

    JPythonConfigUtil.configPythonOperator(self._j_stream_execution_environment)

    gateway.jvm.org.apache.flink.python.chain.PythonOperatorChainingOptimizer.apply(
        self._j_stream_execution_environment)

    JPythonConfigUtil.setPartitionCustomOperatorNumPartitions(
        get_field_value(self._j_stream_execution_environment, "transformations"))

    j_stream_graph = self._j_stream_execution_environment.getStreamGraph(clear_transformations)
    if job_name is not None:
        j_stream_graph.setJobName(job_name)
    return j_stream_graph
def test_set_partitions(self):
    topic_partition_1 = KafkaTopicPartition('test_topic', 1)
    topic_partition_2 = KafkaTopicPartition('test_topic', 2)
    source = KafkaSource.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_partitions({topic_partition_1, topic_partition_2}) \
        .set_value_only_deserializer(SimpleStringSchema()) \
        .build()
    kafka_subscriber = get_field_value(source.get_java_function(), 'subscriber')
    self.assertEqual(
        kafka_subscriber.getClass().getCanonicalName(),
        'org.apache.flink.connector.kafka.source.enumerator.subscriber.PartitionSetSubscriber')
    partitions = get_field_value(kafka_subscriber, 'subscribedPartitions')
    self.assertTrue(is_instance_of(partitions, get_gateway().jvm.java.util.Set))
    self.assertTrue(topic_partition_1._to_j_topic_partition() in partitions)
    self.assertTrue(topic_partition_2._to_j_topic_partition() in partitions)
def test_for_bounded_out_of_orderness(self):
    jvm = get_gateway().jvm
    j_watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(3))._j_watermark_strategy
    j_watermark_generator = j_watermark_strategy.createWatermarkGenerator(None)
    self.assertTrue(
        is_instance_of(
            j_watermark_generator,
            jvm.org.apache.flink.api.common.eventtime.BoundedOutOfOrdernessWatermarks))
    self.assertEqual(
        get_field_value(j_watermark_generator, "outOfOrdernessMillis"), 3000)
def test_kinesis_source(self):
    consumer_config = {
        'aws.region': 'us-east-1',
        'aws.credentials.provider.basic.accesskeyid': 'aws_access_key_id',
        'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key',
        'flink.stream.initpos': 'LATEST'
    }

    kinesis_source = FlinkKinesisConsumer("stream-1", SimpleStringSchema(), consumer_config)

    ds = self.env.add_source(source_func=kinesis_source, source_name="kinesis source")
    ds.print()
    plan = eval(self.env.get_execution_plan())

    self.assertEqual('Source: kinesis source', plan['nodes'][0]['type'])
    self.assertEqual(
        get_field_value(kinesis_source.get_java_function(), 'streams')[0], 'stream-1')
def _generate_stream_graph(self, clear_transformations: bool = False, job_name: str = None) \
        -> JavaObject:
    gateway = get_gateway()
    JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil

    JPythonConfigUtil.configPythonOperator(self._j_stream_execution_environment)

    gateway.jvm.org.apache.flink.python.chain.PythonOperatorChainingOptimizer.apply(
        self._j_stream_execution_environment)

    JPythonConfigUtil.setPartitionCustomOperatorNumPartitions(
        get_field_value(self._j_stream_execution_environment, "transformations"))

    j_stream_graph = self._j_stream_execution_environment.getStreamGraph(clear_transformations)
    if job_name is not None:
        j_stream_graph.setJobName(job_name)
    return j_stream_graph
def kafka_connector_assertion(self, flink_kafka_consumer_clz, flink_kafka_producer_clz):
    source_topic = 'test_source_topic'
    sink_topic = 'test_sink_topic'
    props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    type_info = Types.ROW([Types.INT(), Types.STRING()])

    # Test for kafka consumer
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    flink_kafka_consumer = flink_kafka_consumer_clz(source_topic, deserialization_schema, props)
    flink_kafka_consumer.set_start_from_earliest()
    flink_kafka_consumer.set_commit_offsets_on_checkpoints(True)

    j_properties = get_field_value(flink_kafka_consumer.get_java_function(), 'properties')
    self.assertEqual('localhost:9092', j_properties.getProperty('bootstrap.servers'))
    self.assertEqual('test_group', j_properties.getProperty('group.id'))
    self.assertTrue(get_field_value(flink_kafka_consumer.get_java_function(),
                                    'enableCommitOnCheckpoints'))
    j_start_up_mode = get_field_value(flink_kafka_consumer.get_java_function(), 'startupMode')

    j_deserializer = get_field_value(flink_kafka_consumer.get_java_function(), 'deserializer')
    j_deserialize_type_info = invoke_java_object_method(j_deserializer, "getProducedType")
    deserialize_type_info = typeinfo._from_java_type(j_deserialize_type_info)
    self.assertTrue(deserialize_type_info == type_info)
    self.assertTrue(j_start_up_mode.equals(get_gateway().jvm
                                           .org.apache.flink.streaming.connectors
                                           .kafka.config.StartupMode.EARLIEST))
    j_topic_desc = get_field_value(flink_kafka_consumer.get_java_function(),
                                   'topicsDescriptor')
    j_topics = invoke_java_object_method(j_topic_desc, 'getFixedTopics')
    self.assertEqual(['test_source_topic'], list(j_topics))

    # Test for kafka producer
    serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
        .build()
    flink_kafka_producer = flink_kafka_producer_clz(sink_topic, serialization_schema, props)
    flink_kafka_producer.set_write_timestamp_to_kafka(False)

    j_producer_config = get_field_value(flink_kafka_producer.get_java_function(),
                                        'producerConfig')
    self.assertEqual('localhost:9092', j_producer_config.getProperty('bootstrap.servers'))
    self.assertEqual('test_group', j_producer_config.getProperty('group.id'))
    self.assertFalse(get_field_value(flink_kafka_producer.get_java_function(),
                                     'writeTimestampToKafka'))
def test_source_deprecated_method(self):
    test_option = ConfigOptions.key('pulsar.source.enableAutoAcknowledgeMessage') \
        .boolean_type().no_default_value()
    pulsar_source = PulsarSource.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_topics('ada') \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_type_info(Types.STRING(), None)) \
        .set_subscription_name('ff') \
        .set_config(test_option, True) \
        .set_config_with_dict({'pulsar.source.autoCommitCursorInterval': '1000'}) \
        .build()
    configuration = get_field_value(pulsar_source.get_java_function(), "sourceConfiguration")
    self.assertEqual(
        configuration.getBoolean(test_option._j_config_option), True)
    self.assertEqual(
        configuration.getLong(
            ConfigOptions.key('pulsar.source.autoCommitCursorInterval')
            .long_type().no_default_value()._j_config_option), 1000)
def _generate_stream_graph(self, clear_transformations: bool = False, job_name: str = None) \
        -> JavaObject:
    gateway = get_gateway()
    JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
    # start BeamFnLoopbackWorkerPoolServicer when executed in MiniCluster
    j_configuration = get_j_env_configuration(self._j_stream_execution_environment)
    if not self._remote_mode and is_local_deployment(j_configuration):
        from pyflink.common import Configuration
        from pyflink.fn_execution.beam.beam_worker_pool_service import \
            BeamFnLoopbackWorkerPoolServicer

        jvm = gateway.jvm
        env_config = JPythonConfigUtil.getEnvironmentConfig(
            self._j_stream_execution_environment)
        parallelism = self.get_parallelism()
        if parallelism > 1 and env_config.containsKey(jvm.PythonOptions.PYTHON_ARCHIVES.key()):
            import logging
            logging.warning("Loopback mode is disabled as python archives are used and the "
                            "parallelism of the job is greater than 1. The Python user-defined "
                            "functions will be executed in an independent Python process.")
        else:
            config = Configuration(j_configuration=j_configuration)
            config.set_string(
                "loopback.server.address", BeamFnLoopbackWorkerPoolServicer().start())

    JPythonConfigUtil.configPythonOperator(self._j_stream_execution_environment)

    gateway.jvm.org.apache.flink.python.chain.PythonOperatorChainingOptimizer.apply(
        self._j_stream_execution_environment)

    JPythonConfigUtil.setPartitionCustomOperatorNumPartitions(
        get_field_value(self._j_stream_execution_environment, "transformations"))

    j_stream_graph = self._j_stream_execution_environment.getStreamGraph(clear_transformations)
    if job_name is not None:
        j_stream_graph.setJobName(job_name)
    return j_stream_graph
def __str__(self):
    if self._schema_string is None:
        self._schema_string = get_field_value(self._j_schema, 'schema').toString()
    return self._schema_string