Example #1
    def _check_specified_offsets_initializer(
            self,
            source: KafkaSource,
            offsets: Dict[KafkaTopicPartition, int],
            reset_strategy: KafkaOffsetResetStrategy,
            is_start: bool = True):
        if is_start:
            field_name = 'startingOffsetsInitializer'
        else:
            field_name = 'stoppingOffsetsInitializer'
        offsets_initializer = get_field_value(source.get_java_function(),
                                              field_name)
        self.assertEqual(
            offsets_initializer.getClass().getCanonicalName(),
            'org.apache.flink.connector.kafka.source.enumerator.initializer'
            '.SpecifiedOffsetsInitializer')

        initial_offsets = get_field_value(offsets_initializer,
                                          'initialOffsets')
        self.assertTrue(
            is_instance_of(initial_offsets,
                           get_gateway().jvm.java.util.Map))
        self.assertEqual(initial_offsets.size(), len(offsets))
        for j_topic_partition in initial_offsets:
            topic_partition = KafkaTopicPartition(
                j_topic_partition.topic(), j_topic_partition.partition())
            self.assertIsNotNone(offsets.get(topic_partition))
            self.assertEqual(initial_offsets[j_topic_partition],
                             offsets[topic_partition])

        offset_reset_strategy = get_field_value(offsets_initializer,
                                                'offsetResetStrategy')
        self.assertTrue(
            offset_reset_strategy.equals(
                reset_strategy._to_j_offset_reset_strategy()))
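
Every assertion on this page reads private fields of the underlying Java connector objects through the get_field_value test utility. The helper itself is not shown in these excerpts; the fragment below is only a sketch of how such a reflective lookup can be written against a Py4J JavaObject (the function name is hypothetical, and PyFlink's real helper may differ in detail).

def _get_field_value_sketch(java_obj, field_name):
    # Sketch only: walk the Java class hierarchy, make the declared field
    # accessible, and read its value via java.lang.reflect.Field.
    clz = java_obj.getClass()
    while clz is not None:
        try:
            field = clz.getDeclaredField(field_name)
            field.setAccessible(True)
            return field.get(java_obj)
        except Exception:
            clz = clz.getSuperclass()
    raise AttributeError("no field named '%s' on %s" % (field_name, java_obj))
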
Example #2
    def _check_reader_handled_offsets_initializer(
            self,
            source: KafkaSource,
            offset: int,
            reset_strategy: KafkaOffsetResetStrategy,
            is_start: bool = True):
        if is_start:
            field_name = 'startingOffsetsInitializer'
        else:
            field_name = 'stoppingOffsetsInitializer'
        offsets_initializer = get_field_value(source.get_java_function(),
                                              field_name)
        self.assertEqual(
            offsets_initializer.getClass().getCanonicalName(),
            'org.apache.flink.connector.kafka.source.enumerator.initializer'
            '.ReaderHandledOffsetsInitializer')

        starting_offset = get_field_value(offsets_initializer,
                                          'startingOffset')
        self.assertEqual(starting_offset, offset)

        offset_reset_strategy = get_field_value(offsets_initializer,
                                                'offsetResetStrategy')
        self.assertTrue(
            offset_reset_strategy.equals(
                reset_strategy._to_j_offset_reset_strategy()))
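
Both checkers need a KafkaSource whose offsets initializer was actually configured by the test. The fragment below is a hypothetical companion test (its name, topic, and offset values are assumptions); it assumes PyFlink's KafkaOffsetsInitializer.offsets(...) factory, whose reset strategy defaults to EARLIEST, and reuses the checker from Example #1.

    def test_set_specified_starting_offsets_sketch(self):
        # Hypothetical companion test: set explicit per-partition starting offsets,
        # then verify what ends up inside the Java KafkaSource with the checker above.
        offsets = {
            KafkaTopicPartition('test_topic', 0): 100,
            KafkaTopicPartition('test_topic', 1): 200,
        }
        source = KafkaSource.builder() \
            .set_bootstrap_servers('localhost:9092') \
            .set_topics('test_topic') \
            .set_value_only_deserializer(SimpleStringSchema()) \
            .set_starting_offsets(KafkaOffsetsInitializer.offsets(offsets)) \
            .build()
        self._check_specified_offsets_initializer(
            source, offsets, KafkaOffsetResetStrategy.EARLIEST)
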
Example #3
    def test_rabbitmq_connectors(self):
        connection_config = RMQConnectionConfig.Builder() \
            .set_host('localhost') \
            .set_port(5672) \
            .set_virtual_host('/') \
            .set_user_name('guest') \
            .set_password('guest') \
            .build()
        type_info = Types.ROW([Types.INT(), Types.STRING()])
        deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(type_info=type_info).build()

        rmq_source = RMQSource(connection_config, 'source_queue', True,
                               deserialization_schema)
        self.assertEqual(
            get_field_value(rmq_source.get_java_function(), 'queueName'),
            'source_queue')
        self.assertTrue(
            get_field_value(rmq_source.get_java_function(),
                            'usesCorrelationId'))

        serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
            .build()
        rmq_sink = RMQSink(connection_config, 'sink_queue',
                           serialization_schema)
        self.assertEqual(
            get_field_value(rmq_sink.get_java_function(), 'queueName'),
            'sink_queue')
Example #4
    def test_set_delivery_guarantee(self):
        sink = KafkaSink.builder() \
            .set_bootstrap_servers('localhost:9092') \
            .set_record_serializer(self._build_serialization_schema()) \
            .build()
        guarantee = get_field_value(sink.get_java_function(),
                                    'deliveryGuarantee')
        self.assertEqual(guarantee.toString(), 'none')

        sink = KafkaSink.builder() \
            .set_bootstrap_servers('localhost:9092') \
            .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
            .set_record_serializer(self._build_serialization_schema()) \
            .build()
        guarantee = get_field_value(sink.get_java_function(),
                                    'deliveryGuarantee')
        self.assertEqual(guarantee.toString(), 'at-least-once')

        sink = KafkaSink.builder() \
            .set_bootstrap_servers('localhost:9092') \
            .set_delivery_guarantee(DeliveryGuarantee.EXACTLY_ONCE) \
            .set_record_serializer(self._build_serialization_schema()) \
            .build()
        guarantee = get_field_value(sink.get_java_function(),
                                    'deliveryGuarantee')
        self.assertEqual(guarantee.toString(), 'exactly-once')
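
The KafkaSink tests on this page call a _build_serialization_schema() helper that is not included in the excerpts. The body below is an assumption, but it is consistent with Example #18, which expects the wrapped serializer to target 'test-topic' and to use SimpleStringSchema for record values.

    def _build_serialization_schema(self):
        # Assumed helper body (not shown in the original excerpts): build the
        # record serializer that the KafkaSink tests pass to set_record_serializer().
        return KafkaRecordSerializationSchema.builder() \
            .set_topic('test-topic') \
            .set_value_serialization_schema(SimpleStringSchema()) \
            .build()
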
Example #5
    def test_kinesis_firehose_sink(self):
        _load_specific_flink_module_jars('/flink-connectors/'
                                         'flink-sql-connector-aws-kinesis-firehose')

        sink_properties = {
            'aws.region': 'eu-west-1',
            'aws.credentials.provider.basic.accesskeyid': 'aws_access_key_id',
            'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
        }

        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                      type_info=Types.ROW([Types.STRING(), Types.INT()]))

        kinesis_firehose_sink = KinesisFirehoseSink.builder() \
            .set_firehose_client_properties(sink_properties) \
            .set_serialization_schema(SimpleStringSchema()) \
            .set_delivery_stream_name('stream-1') \
            .set_fail_on_error(False) \
            .set_max_batch_size(500) \
            .set_max_in_flight_requests(50) \
            .set_max_buffered_requests(10000) \
            .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
            .set_max_time_in_buffer_ms(5000) \
            .set_max_record_size_in_bytes(1 * 1024 * 1024) \
            .build()

        ds.sink_to(kinesis_firehose_sink).name('kinesis firehose sink')
        plan = eval(self.env.get_execution_plan())

        self.assertEqual('kinesis firehose sink: Writer', plan['nodes'][1]['type'])
        self.assertEqual(get_field_value(kinesis_firehose_sink.get_java_function(), 'failOnError'),
                         False)
        self.assertEqual(
            get_field_value(kinesis_firehose_sink.get_java_function(), 'deliveryStreamName'),
            'stream-1')
Example #6
    def test_kinesis_streams_sink(self):
        sink_properties = {
            'aws.region': 'us-east-1',
            'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
        }

        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                      type_info=Types.ROW([Types.STRING(), Types.INT()]))

        kinesis_streams_sink = KinesisStreamsSink.builder() \
            .set_kinesis_client_properties(sink_properties) \
            .set_serialization_schema(SimpleStringSchema()) \
            .set_partition_key_generator(PartitionKeyGenerator.fixed()) \
            .set_stream_name("stream-1") \
            .set_fail_on_error(False) \
            .set_max_batch_size(500) \
            .set_max_in_flight_requests(50) \
            .set_max_buffered_requests(10000) \
            .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
            .set_max_time_in_buffer_ms(5000) \
            .set_max_record_size_in_bytes(1 * 1024 * 1024) \
            .build()

        ds.sink_to(kinesis_streams_sink).name('kinesis streams sink')
        plan = eval(self.env.get_execution_plan())

        self.assertEqual('kinesis streams sink: Writer', plan['nodes'][1]['type'])
        self.assertEqual(get_field_value(kinesis_streams_sink.get_java_function(), 'failOnError'),
                         False)
        self.assertEqual(
            get_field_value(kinesis_streams_sink.get_java_function(), 'streamName'), 'stream-1')
Example #7
 def startup_loopback_server():
     from pyflink.fn_execution.beam.beam_worker_pool_service import \
         BeamFnLoopbackWorkerPoolServicer
     jvm = gateway.jvm
     j_env = jvm.System.getenv()
     get_field_value(j_env, "m").put(
         'PYFLINK_LOOPBACK_SERVER_ADDRESS',
         BeamFnLoopbackWorkerPoolServicer().start())
Example #8
    def test_es_sink_dynamic(self):
        ds = self.env.from_collection([{
            'name': 'ada',
            'id': '1'
        }, {
            'name': 'luna',
            'id': '2'
        }],
                                      type_info=Types.MAP(
                                          Types.STRING(), Types.STRING()))

        es_dynamic_index_sink = Elasticsearch7SinkBuilder() \
            .set_emitter(ElasticsearchEmitter.dynamic_index('name', 'id')) \
            .set_hosts(['localhost:9200']) \
            .build()

        j_emitter = get_field_value(es_dynamic_index_sink.get_java_function(),
                                    'emitter')
        self.assertTrue(
            is_instance_of(
                j_emitter,
                'org.apache.flink.connector.elasticsearch.sink.SimpleElasticsearchEmitter'
            ))

        ds.sink_to(es_dynamic_index_sink).name('es dynamic index sink')
Example #9
 def startup_loopback_server():
     jvm = gateway.jvm
     env_config = JPythonConfigUtil.getEnvironmentConfig(
         self._j_stream_execution_environment)
     parallelism = self.get_parallelism()
     if parallelism > 1 and env_config.containsKey(jvm.PythonOptions.PYTHON_ARCHIVES.key()):
         import logging
         logging.warning("Loopback mode is disabled as python archives are used and the "
                         "parallelism of the job is greater than 1. The Python user-defined "
                         "functions will be executed in an independent Python process.")
     else:
         from pyflink.fn_execution.beam.beam_worker_pool_service import \
             BeamFnLoopbackWorkerPoolServicer
         j_env = jvm.System.getenv()
         get_field_value(j_env, "m").put(
             'PYFLINK_LOOPBACK_SERVER_ADDRESS', BeamFnLoopbackWorkerPoolServicer().start())
Example #10
 def test_set_topic_pattern(self):
     source = KafkaSource.builder() \
         .set_bootstrap_servers('localhost:9092') \
         .set_topic_pattern('test_topic*') \
         .set_value_only_deserializer(SimpleStringSchema()) \
         .build()
     kafka_subscriber = get_field_value(source.get_java_function(),
                                        'subscriber')
     self.assertEqual(
         kafka_subscriber.getClass().getCanonicalName(),
         'org.apache.flink.connector.kafka.source.enumerator.subscriber.TopicPatternSubscriber'
     )
     topic_pattern = get_field_value(kafka_subscriber, 'topicPattern')
     self.assertTrue(
         is_instance_of(topic_pattern,
                        get_gateway().jvm.java.util.regex.Pattern))
     self.assertEqual(topic_pattern.toString(), 'test_topic*')
Example #11
 def _check(schema: DeserializationSchema, class_name: str):
     source = KafkaSource.builder() \
         .set_bootstrap_servers('localhost:9092') \
         .set_topics('test_topic') \
         .set_value_only_deserializer(schema) \
         .build()
     deserialization_schema_wrapper = get_field_value(
         source.get_java_function(), 'deserializationSchema')
     self.assertEqual(
         deserialization_schema_wrapper.getClass().getCanonicalName(),
         'org.apache.flink.connector.kafka.source.reader.deserializer'
         '.KafkaValueOnlyDeserializationSchemaWrapper')
     deserialization_schema = get_field_value(
         deserialization_schema_wrapper, 'deserializationSchema')
     self.assertEqual(
         deserialization_schema.getClass().getCanonicalName(),
         class_name)
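
The _check helper is parameterized over the value-only deserialization schema. A hypothetical call site is sketched below; the expected canonical class name for SimpleStringSchema is the one asserted in Example #18.

    # Hypothetical call site for the helper above.
    _check(SimpleStringSchema(),
           'org.apache.flink.api.common.serialization.SimpleStringSchema')
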
Example #12
    def test_jdbc_sink(self):
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))
        jdbc_connection_options = JdbcConnectionOptions.JdbcConnectionOptionsBuilder()\
            .with_driver_name('com.mysql.jdbc.Driver')\
            .with_user_name('root')\
            .with_password('password')\
            .with_url('jdbc:mysql://server-name:server-port/database-name').build()

        jdbc_execution_options = JdbcExecutionOptions.builder().with_batch_interval_ms(2000)\
            .with_batch_size(100).with_max_retries(5).build()
        jdbc_sink = JdbcSink.sink("insert into test table", ds.get_type(),
                                  jdbc_connection_options,
                                  jdbc_execution_options)

        ds.add_sink(jdbc_sink).name('jdbc sink')
        plan = eval(self.env.get_execution_plan())
        self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])
        j_output_format = get_field_value(jdbc_sink.get_java_function(),
                                          'outputFormat')

        connection_options = JdbcConnectionOptions(
            get_field_value(
                get_field_value(j_output_format, 'connectionProvider'),
                'jdbcOptions'))
        self.assertEqual(jdbc_connection_options.get_db_url(),
                         connection_options.get_db_url())
        self.assertEqual(jdbc_connection_options.get_driver_name(),
                         connection_options.get_driver_name())
        self.assertEqual(jdbc_connection_options.get_password(),
                         connection_options.get_password())
        self.assertEqual(jdbc_connection_options.get_user_name(),
                         connection_options.get_user_name())

        exec_options = JdbcExecutionOptions(
            get_field_value(j_output_format, 'executionOptions'))
        self.assertEqual(jdbc_execution_options.get_batch_interval_ms(),
                         exec_options.get_batch_interval_ms())
        self.assertEqual(jdbc_execution_options.get_batch_size(),
                         exec_options.get_batch_size())
        self.assertEqual(jdbc_execution_options.get_max_retries(),
                         exec_options.get_max_retries())
Example #13
    def _check_timestamp_offsets_initializer(self,
                                             source: KafkaSource,
                                             timestamp: int,
                                             is_start: bool = True):
        if is_start:
            field_name = 'startingOffsetsInitializer'
        else:
            field_name = 'stoppingOffsetsInitializer'
        offsets_initializer = get_field_value(source.get_java_function(),
                                              field_name)
        self.assertEqual(
            offsets_initializer.getClass().getCanonicalName(),
            'org.apache.flink.connector.kafka.source.enumerator.initializer'
            '.TimestampOffsetsInitializer')

        starting_timestamp = get_field_value(offsets_initializer,
                                             'startingTimestamp')
        self.assertEqual(starting_timestamp, timestamp)
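
Like the other initializer checkers, the timestamp variant needs a source built with a timestamp-based start position. The fragment below is a hypothetical companion test (its name and the timestamp are assumptions), assuming PyFlink's KafkaOffsetsInitializer.timestamp(...) factory.

    def test_set_starting_offsets_by_timestamp_sketch(self):
        # Hypothetical companion test: start from a publish timestamp and verify it
        # with the checker above.
        source = KafkaSource.builder() \
            .set_bootstrap_servers('localhost:9092') \
            .set_topics('test_topic') \
            .set_value_only_deserializer(SimpleStringSchema()) \
            .set_starting_offsets(KafkaOffsetsInitializer.timestamp(1646637342000)) \
            .build()
        self._check_timestamp_offsets_initializer(source, 1646637342000)
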
Example #14
 def test_set_transactional_id_prefix(self):
     sink = KafkaSink.builder() \
         .set_bootstrap_servers('localhost:9092') \
         .set_transactional_id_prefix('test-prefix') \
         .set_record_serializer(self._build_serialization_schema()) \
         .build()
     prefix = get_field_value(sink.get_java_function(),
                              'transactionalIdPrefix')
     self.assertEqual(prefix, 'test-prefix')
Example #15
 def test_set_property(self):
     sink = KafkaSink.builder() \
         .set_bootstrap_servers('localhost:9092') \
         .set_record_serializer(self._build_serialization_schema()) \
         .set_property('test-key', 'test-value') \
         .build()
     config = get_field_value(sink.get_java_function(),
                              'kafkaProducerConfig')
     self.assertEqual(config.get('test-key'), 'test-value')
Example #16
    def test_pulsar_source(self):
        TEST_OPTION_NAME = 'pulsar.source.enableAutoAcknowledgeMessage'
        pulsar_source = PulsarSource.builder() \
            .set_service_url('pulsar://localhost:6650') \
            .set_admin_url('http://localhost:8080') \
            .set_topics('ada') \
            .set_start_cursor(StartCursor.earliest()) \
            .set_unbounded_stop_cursor(StopCursor.never()) \
            .set_bounded_stop_cursor(StopCursor.at_publish_time(22)) \
            .set_subscription_name('ff') \
            .set_subscription_type(SubscriptionType.Exclusive) \
            .set_deserialization_schema(
                PulsarDeserializationSchema.flink_type_info(Types.STRING())) \
            .set_deserialization_schema(
                PulsarDeserializationSchema.flink_schema(SimpleStringSchema())) \
            .set_config(TEST_OPTION_NAME, True) \
            .set_properties({'pulsar.source.autoCommitCursorInterval': '1000'}) \
            .build()

        ds = self.env.from_source(source=pulsar_source,
                                  watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
                                  source_name="pulsar source")
        ds.print()
        plan = eval(self.env.get_execution_plan())
        self.assertEqual('Source: pulsar source', plan['nodes'][0]['type'])

        configuration = get_field_value(pulsar_source.get_java_function(), "sourceConfiguration")
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.client.serviceUrl')
                .string_type()
                .no_default_value()._j_config_option), 'pulsar://localhost:6650')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.admin.adminUrl')
                .string_type()
                .no_default_value()._j_config_option), 'http://localhost:8080')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.consumer.subscriptionName')
                .string_type()
                .no_default_value()._j_config_option), 'ff')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.consumer.subscriptionType')
                .string_type()
                .no_default_value()._j_config_option), SubscriptionType.Exclusive.name)
        test_option = ConfigOptions.key(TEST_OPTION_NAME).boolean_type().no_default_value()
        self.assertEqual(
            configuration.getBoolean(
                test_option._j_config_option), True)
        self.assertEqual(
            configuration.getLong(
                ConfigOptions.key('pulsar.source.autoCommitCursorInterval')
                .long_type()
                .no_default_value()._j_config_option), 1000)
Example #17
 def test_set_topics(self):
     source = KafkaSource.builder() \
         .set_bootstrap_servers('localhost:9092') \
         .set_topics('test_topic1', 'test_topic2') \
         .set_value_only_deserializer(SimpleStringSchema()) \
         .build()
     kafka_subscriber = get_field_value(source.get_java_function(),
                                        'subscriber')
     self.assertEqual(
         kafka_subscriber.getClass().getCanonicalName(),
         'org.apache.flink.connector.kafka.source.enumerator.subscriber.TopicListSubscriber'
     )
     topics = get_field_value(kafka_subscriber, 'topics')
     self.assertTrue(
         is_instance_of(topics,
                        get_gateway().jvm.java.util.List))
     self.assertEqual(topics.size(), 2)
     self.assertEqual(topics[0], 'test_topic1')
     self.assertEqual(topics[1], 'test_topic2')
Example #18
 def test_set_record_serializer(self):
     sink = KafkaSink.builder() \
         .set_bootstrap_servers('localhost:9092') \
         .set_record_serializer(self._build_serialization_schema()) \
         .build()
     serializer = get_field_value(sink.get_java_function(),
                                  'recordSerializer')
     self.assertEqual(
         serializer.getClass().getCanonicalName(),
         'org.apache.flink.connector.kafka.sink.'
         'KafkaRecordSerializationSchemaBuilder.'
         'KafkaRecordSerializationSchemaWrapper')
     topic_selector = get_field_value(serializer, 'topicSelector')
     self.assertEqual(topic_selector.apply(None), 'test-topic')
     value_serializer = get_field_value(serializer,
                                        'valueSerializationSchema')
     self.assertEqual(
         value_serializer.getClass().getCanonicalName(),
         'org.apache.flink.api.common.serialization.SimpleStringSchema')
Example #19
    def _generate_stream_graph(self, clear_transformations: bool = False, job_name: str = None) \
            -> JavaObject:
        gateway = get_gateway()
        JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
        # start BeamFnLoopbackWorkerPoolServicer when executed in MiniCluster
        j_configuration = get_j_env_configuration(self._j_stream_execution_environment)

        def startup_loopback_server():
            jvm = gateway.jvm
            env_config = JPythonConfigUtil.getEnvironmentConfig(
                self._j_stream_execution_environment)
            parallelism = self.get_parallelism()
            if parallelism > 1 and env_config.containsKey(jvm.PythonOptions.PYTHON_ARCHIVES.key()):
                import logging
                logging.warning("Loopback mode is disabled as python archives are used and the "
                                "parallelism of the job is greater than 1. The Python user-defined "
                                "functions will be executed in an independent Python process.")
            else:
                from pyflink.fn_execution.beam.beam_worker_pool_service import \
                    BeamFnLoopbackWorkerPoolServicer
                j_env = jvm.System.getenv()
                get_field_value(j_env, "m").put(
                    'PYFLINK_LOOPBACK_SERVER_ADDRESS', BeamFnLoopbackWorkerPoolServicer().start())

        python_worker_execution_mode = None
        if hasattr(self, "_python_worker_execution_mode"):
            python_worker_execution_mode = getattr(self, "_python_worker_execution_mode")

        if python_worker_execution_mode is None:
            if is_local_deployment(j_configuration):
                startup_loopback_server()
        elif python_worker_execution_mode == 'loopback':
            if is_local_deployment(j_configuration):
                startup_loopback_server()
            else:
                raise ValueError("Loopback mode is enabled, however the job wasn't configured to "
                                 "run in local deployment mode")
        elif python_worker_execution_mode != 'process':
            raise ValueError(
                "It only supports to execute the Python worker in 'loopback' mode and 'process' "
                "mode, unknown mode '%s' is configured" % python_worker_execution_mode)

        JPythonConfigUtil.configPythonOperator(self._j_stream_execution_environment)

        gateway.jvm.org.apache.flink.python.chain.PythonOperatorChainingOptimizer.apply(
            self._j_stream_execution_environment)

        JPythonConfigUtil.setPartitionCustomOperatorNumPartitions(
            get_field_value(self._j_stream_execution_environment, "transformations"))

        j_stream_graph = self._j_stream_execution_environment.getStreamGraph(clear_transformations)
        if job_name is not None:
            j_stream_graph.setJobName(job_name)
        return j_stream_graph
Example #20
 def test_with_idleness(self):
     jvm = get_gateway().jvm
     j_watermark_strategy = WatermarkStrategy.no_watermarks().with_idleness(
         Duration.of_seconds(5))._j_watermark_strategy
     self.assertTrue(
         is_instance_of(
             j_watermark_strategy, jvm.org.apache.flink.api.common.
             eventtime.WatermarkStrategyWithIdleness))
     self.assertEqual(
         get_field_value(j_watermark_strategy,
                         "idlenessTimeout").toMillis(), 5000)
Example #21
 def _check_value_serialization_schema(value_serialization_schema,
                                       expected_class):
     serialization_schema = KafkaRecordSerializationSchema.builder() \
         .set_topic('test-topic') \
         .set_value_serialization_schema(value_serialization_schema) \
         .build()
     schema_field = get_field_value(
         serialization_schema._j_serialization_schema,
         'valueSerializationSchema')
     self.assertIsNotNone(schema_field)
     self.assertEqual(schema_field.getClass().getCanonicalName(),
                      expected_class)
Example #22
    def _generate_stream_graph(self, clear_transformations: bool = False, job_name: str = None) \
            -> JavaObject:
        gateway = get_gateway()
        JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
        # start BeamFnLoopbackWorkerPoolServicer when executed in MiniCluster
        j_configuration = get_j_env_configuration(
            self._j_stream_execution_environment)

        def startup_loopback_server():
            from pyflink.fn_execution.beam.beam_worker_pool_service import \
                BeamFnLoopbackWorkerPoolServicer
            jvm = gateway.jvm
            j_env = jvm.System.getenv()
            get_field_value(j_env, "m").put(
                'PYFLINK_LOOPBACK_SERVER_ADDRESS',
                BeamFnLoopbackWorkerPoolServicer().start())

        python_worker_execution_mode = None
        if hasattr(self, "_python_worker_execution_mode"):
            python_worker_execution_mode = getattr(
                self, "_python_worker_execution_mode")

        if python_worker_execution_mode is None:
            if is_local_deployment(j_configuration):
                startup_loopback_server()
        elif python_worker_execution_mode == 'loopback':
            if is_local_deployment(j_configuration):
                startup_loopback_server()
            else:
                raise ValueError(
                    "Loopback mode is enabled, however the job wasn't configured to "
                    "run in local deployment mode")
        elif python_worker_execution_mode != 'process':
            raise ValueError(
                "It only supports to execute the Python worker in 'loopback' mode and 'process' "
                "mode, unknown mode '%s' is configured" %
                python_worker_execution_mode)

        JPythonConfigUtil.configPythonOperator(
            self._j_stream_execution_environment)

        gateway.jvm.org.apache.flink.python.chain.PythonOperatorChainingOptimizer.apply(
            self._j_stream_execution_environment)

        JPythonConfigUtil.setPartitionCustomOperatorNumPartitions(
            get_field_value(self._j_stream_execution_environment,
                            "transformations"))

        j_stream_graph = self._j_stream_execution_environment.getStreamGraph(
            clear_transformations)
        if job_name is not None:
            j_stream_graph.setJobName(job_name)
        return j_stream_graph
Example #23
 def test_set_partitions(self):
     topic_partition_1 = KafkaTopicPartition('test_topic', 1)
     topic_partition_2 = KafkaTopicPartition('test_topic', 2)
     source = KafkaSource.builder() \
         .set_bootstrap_servers('localhost:9092') \
         .set_partitions({topic_partition_1, topic_partition_2}) \
         .set_value_only_deserializer(SimpleStringSchema()) \
         .build()
     kafka_subscriber = get_field_value(source.get_java_function(),
                                        'subscriber')
     self.assertEqual(
         kafka_subscriber.getClass().getCanonicalName(),
         'org.apache.flink.connector.kafka.source.enumerator.subscriber.PartitionSetSubscriber'
     )
     partitions = get_field_value(kafka_subscriber, 'subscribedPartitions')
     self.assertTrue(
         is_instance_of(partitions,
                        get_gateway().jvm.java.util.Set))
     self.assertTrue(
         topic_partition_1._to_j_topic_partition() in partitions)
     self.assertTrue(
         topic_partition_2._to_j_topic_partition() in partitions)
Example #24
 def test_for_bounded_out_of_orderness(self):
     jvm = get_gateway().jvm
     j_watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
         Duration.of_seconds(3))._j_watermark_strategy
     j_watermark_generator = j_watermark_strategy.createWatermarkGenerator(
         None)
     self.assertTrue(
         is_instance_of(
             j_watermark_generator, jvm.org.apache.flink.api.common.
             eventtime.BoundedOutOfOrdernessWatermarks))
     self.assertEqual(
         get_field_value(j_watermark_generator, "outOfOrdernessMillis"),
         3000)
Example #25
    def test_kinesis_source(self):
        consumer_config = {
            'aws.region': 'us-east-1',
            'aws.credentials.provider.basic.accesskeyid': 'aws_access_key_id',
            'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key',
            'flink.stream.initpos': 'LATEST'
        }

        kinesis_source = FlinkKinesisConsumer("stream-1", SimpleStringSchema(), consumer_config)

        ds = self.env.add_source(source_func=kinesis_source, source_name="kinesis source")
        ds.print()
        plan = eval(self.env.get_execution_plan())
        self.assertEqual('Source: kinesis source', plan['nodes'][0]['type'])
        self.assertEqual(
            get_field_value(kinesis_source.get_java_function(), 'streams')[0], 'stream-1')
Example #26
    def _generate_stream_graph(self, clear_transformations: bool = False, job_name: str = None) \
            -> JavaObject:
        gateway = get_gateway()
        JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil

        JPythonConfigUtil.configPythonOperator(self._j_stream_execution_environment)

        gateway.jvm.org.apache.flink.python.chain.PythonOperatorChainingOptimizer.apply(
            self._j_stream_execution_environment)

        JPythonConfigUtil.setPartitionCustomOperatorNumPartitions(
            get_field_value(self._j_stream_execution_environment, "transformations"))

        j_stream_graph = self._j_stream_execution_environment.getStreamGraph(clear_transformations)
        if job_name is not None:
            j_stream_graph.setJobName(job_name)
        return j_stream_graph
Example #27
    def kafka_connector_assertion(self, flink_kafka_consumer_clz, flink_kafka_producer_clz):
        source_topic = 'test_source_topic'
        sink_topic = 'test_sink_topic'
        props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
        type_info = Types.ROW([Types.INT(), Types.STRING()])

        # Test for kafka consumer
        deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(type_info=type_info).build()

        flink_kafka_consumer = flink_kafka_consumer_clz(source_topic, deserialization_schema, props)
        flink_kafka_consumer.set_start_from_earliest()
        flink_kafka_consumer.set_commit_offsets_on_checkpoints(True)

        j_properties = get_field_value(flink_kafka_consumer.get_java_function(), 'properties')
        self.assertEqual('localhost:9092', j_properties.getProperty('bootstrap.servers'))
        self.assertEqual('test_group', j_properties.getProperty('group.id'))
        self.assertTrue(get_field_value(flink_kafka_consumer.get_java_function(),
                                        'enableCommitOnCheckpoints'))
        j_start_up_mode = get_field_value(flink_kafka_consumer.get_java_function(), 'startupMode')

        j_deserializer = get_field_value(flink_kafka_consumer.get_java_function(), 'deserializer')
        j_deserialize_type_info = invoke_java_object_method(j_deserializer, "getProducedType")
        deserialize_type_info = typeinfo._from_java_type(j_deserialize_type_info)
        self.assertTrue(deserialize_type_info == type_info)
        self.assertTrue(j_start_up_mode.equals(get_gateway().jvm
                                               .org.apache.flink.streaming.connectors
                                               .kafka.config.StartupMode.EARLIEST))
        j_topic_desc = get_field_value(flink_kafka_consumer.get_java_function(),
                                       'topicsDescriptor')
        j_topics = invoke_java_object_method(j_topic_desc, 'getFixedTopics')
        self.assertEqual(['test_source_topic'], list(j_topics))

        # Test for kafka producer
        serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
            .build()
        flink_kafka_producer = flink_kafka_producer_clz(sink_topic, serialization_schema, props)
        flink_kafka_producer.set_write_timestamp_to_kafka(False)

        j_producer_config = get_field_value(flink_kafka_producer.get_java_function(),
                                            'producerConfig')
        self.assertEqual('localhost:9092', j_producer_config.getProperty('bootstrap.servers'))
        self.assertEqual('test_group', j_producer_config.getProperty('group.id'))
        self.assertFalse(get_field_value(flink_kafka_producer.get_java_function(),
                                         'writeTimestampToKafka'))
Example #28
 def test_source_deprecated_method(self):
     test_option = ConfigOptions.key('pulsar.source.enableAutoAcknowledgeMessage') \
         .boolean_type().no_default_value()
     pulsar_source = PulsarSource.builder() \
         .set_service_url('pulsar://localhost:6650') \
         .set_admin_url('http://localhost:8080') \
         .set_topics('ada') \
         .set_deserialization_schema(
             PulsarDeserializationSchema.flink_type_info(Types.STRING(), None)) \
         .set_subscription_name('ff') \
         .set_config(test_option, True) \
         .set_config_with_dict({'pulsar.source.autoCommitCursorInterval': '1000'}) \
         .build()
     configuration = get_field_value(pulsar_source.get_java_function(),
                                     "sourceConfiguration")
     self.assertEqual(
         configuration.getBoolean(test_option._j_config_option), True)
     self.assertEqual(
         configuration.getLong(
             ConfigOptions.key('pulsar.source.autoCommitCursorInterval').
             long_type().no_default_value()._j_config_option), 1000)
Example #29
    def _generate_stream_graph(self, clear_transformations: bool = False, job_name: str = None) \
            -> JavaObject:
        gateway = get_gateway()
        JPythonConfigUtil = gateway.jvm.org.apache.flink.python.util.PythonConfigUtil
        # start BeamFnLoopbackWorkerPoolServicer when executed in MiniCluster
        j_configuration = get_j_env_configuration(self._j_stream_execution_environment)
        if not self._remote_mode and is_local_deployment(j_configuration):
            from pyflink.common import Configuration
            from pyflink.fn_execution.beam.beam_worker_pool_service import \
                BeamFnLoopbackWorkerPoolServicer

            jvm = gateway.jvm
            env_config = JPythonConfigUtil.getEnvironmentConfig(
                self._j_stream_execution_environment)
            parallelism = self.get_parallelism()
            if parallelism > 1 and env_config.containsKey(jvm.PythonOptions.PYTHON_ARCHIVES.key()):
                import logging
                logging.warning("Lookback mode is disabled as python archives are used and the "
                                "parallelism of the job is greater than 1. The Python user-defined "
                                "functions will be executed in an independent Python process.")
            else:
                config = Configuration(j_configuration=j_configuration)
                config.set_string(
                    "loopback.server.address", BeamFnLoopbackWorkerPoolServicer().start())

        JPythonConfigUtil.configPythonOperator(self._j_stream_execution_environment)

        gateway.jvm.org.apache.flink.python.chain.PythonOperatorChainingOptimizer.apply(
            self._j_stream_execution_environment)

        JPythonConfigUtil.setPartitionCustomOperatorNumPartitions(
            get_field_value(self._j_stream_execution_environment, "transformations"))

        j_stream_graph = self._j_stream_execution_environment.getStreamGraph(clear_transformations)
        if job_name is not None:
            j_stream_graph.setJobName(job_name)
        return j_stream_graph
Example #30
 def __str__(self):
     if self._schema_string is None:
         self._schema_string = get_field_value(self._j_schema,
                                               'schema').toString()
     return self._schema_string