Example #1
    def test_execute_and_collect(self):
        test_data = ['pyflink', 'datastream', 'execute', 'collect']
        ds = self.env.from_collection(test_data)

        expected = test_data[:3]
        actual = []
        for result in ds.execute_and_collect(limit=3):
            actual.append(result)
        self.assertEqual(expected, actual)

        expected = test_data
        ds = self.env.from_collection(collection=test_data,
                                      type_info=Types.STRING())
        with ds.execute_and_collect() as results:
            actual = []
            for result in results:
                actual.append(result)
            self.assertEqual(expected, actual)

        test_data = [
            (1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
             bytearray(b'flink'), 'pyflink', datetime.date(2014, 9, 13),
             datetime.time(hour=12, minute=0, second=0, microsecond=123000),
             datetime.datetime(2018, 3, 11, 3, 0, 0,
                               123000), [1, 2, 3], [['pyflink', 'datastream'],
                                                    ['execute', 'collect']],
             decimal.Decimal('1000000000000000000.05'),
             decimal.Decimal('1000000000000000000.0599999999999'
                             '9999899999999999')),
            (2, None, 2, True, 23878, 652516352, 9.87, 2.98936,
             bytearray(b'flink'), 'pyflink', datetime.date(2015, 10, 14),
             datetime.time(hour=11, minute=2, second=2, microsecond=234500),
             datetime.datetime(2020, 4, 15, 8, 2, 6,
                               235000), [2, 4, 6], [['pyflink', 'datastream'],
                                                    ['execute', 'collect']],
             decimal.Decimal('2000000000000000000.74'),
             decimal.Decimal('2000000000000000000.061111111111111'
                             '11111111111111'))
        ]
        expected = test_data
        ds = self.env.from_collection(test_data)
        with ds.execute_and_collect() as results:
            actual = []
            for result in results:
                actual.append(result)
            self.assertEqual(expected, actual)
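
A minimal, standalone sketch of the same API outside the test harness (assumes a local
StreamExecutionEnvironment; the data is illustrative):

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
ds = env.from_collection(['pyflink', 'datastream'], type_info=Types.STRING())

# execute_and_collect() submits the job and streams the results back to the client;
# using it as a context manager closes the iterator and cleans up the job afterwards.
with ds.execute_and_collect() as results:
    for value in results:
        print(value)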
Example #2
    def test_jdbc_sink(self):
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))
        jdbc_connection_options = JdbcConnectionOptions.JdbcConnectionOptionsBuilder()\
            .with_driver_name('com.mysql.jdbc.Driver')\
            .with_user_name('root')\
            .with_password('password')\
            .with_url('jdbc:mysql://server-name:server-port/database-name').build()

        jdbc_execution_options = JdbcExecutionOptions.builder().with_batch_interval_ms(2000)\
            .with_batch_size(100).with_max_retries(5).build()
        jdbc_sink = JdbcSink.sink("insert into test table", ds.get_type(),
                                  jdbc_connection_options,
                                  jdbc_execution_options)

        ds.add_sink(jdbc_sink).name('jdbc sink')
        plan = eval(self.env.get_execution_plan())
        self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])
        j_output_format = get_field_value(jdbc_sink.get_java_function(),
                                          'outputFormat')

        connection_options = JdbcConnectionOptions(
            get_field_value(
                get_field_value(j_output_format, 'connectionProvider'),
                'jdbcOptions'))
        self.assertEqual(jdbc_connection_options.get_db_url(),
                         connection_options.get_db_url())
        self.assertEqual(jdbc_connection_options.get_driver_name(),
                         connection_options.get_driver_name())
        self.assertEqual(jdbc_connection_options.get_password(),
                         connection_options.get_password())
        self.assertEqual(jdbc_connection_options.get_user_name(),
                         connection_options.get_user_name())

        exec_options = JdbcExecutionOptions(
            get_field_value(j_output_format, 'executionOptions'))
        self.assertEqual(jdbc_execution_options.get_batch_interval_ms(),
                         exec_options.get_batch_interval_ms())
        self.assertEqual(jdbc_execution_options.get_batch_size(),
                         exec_options.get_batch_size())
        self.assertEqual(jdbc_execution_options.get_max_retries(),
                         exec_options.get_max_retries())
Example #3
    def kafka_connector_assertion(self, flink_kafka_consumer_clz, flink_kafka_producer_clz):
        source_topic = 'test_source_topic'
        sink_topic = 'test_sink_topic'
        props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
        type_info = Types.ROW([Types.INT(), Types.STRING()])

        # Test for kafka consumer
        deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(type_info=type_info).build()

        flink_kafka_consumer = flink_kafka_consumer_clz(source_topic, deserialization_schema, props)
        flink_kafka_consumer.set_start_from_earliest()
        flink_kafka_consumer.set_commit_offsets_on_checkpoints(True)

        j_properties = get_private_field(flink_kafka_consumer.get_java_function(), 'properties')
        self.assertEqual('localhost:9092', j_properties.getProperty('bootstrap.servers'))
        self.assertEqual('test_group', j_properties.getProperty('group.id'))
        self.assertTrue(get_private_field(flink_kafka_consumer.get_java_function(),
                                          'enableCommitOnCheckpoints'))
        j_start_up_mode = get_private_field(flink_kafka_consumer.get_java_function(), 'startupMode')

        j_deserializer = get_private_field(flink_kafka_consumer.get_java_function(), 'deserializer')
        j_deserialize_type_info = invoke_java_object_method(j_deserializer, "getProducedType")
        deserialize_type_info = typeinfo._from_java_type(j_deserialize_type_info)
        self.assertTrue(deserialize_type_info == type_info)
        self.assertTrue(j_start_up_mode.equals(get_gateway().jvm
                                               .org.apache.flink.streaming.connectors
                                               .kafka.config.StartupMode.EARLIEST))
        j_topic_desc = get_private_field(flink_kafka_consumer.get_java_function(),
                                         'topicsDescriptor')
        j_topics = invoke_java_object_method(j_topic_desc, 'getFixedTopics')
        self.assertEqual(['test_source_topic'], list(j_topics))

        # Test for kafka producer
        serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
            .build()
        flink_kafka_producer = flink_kafka_producer_clz(sink_topic, serialization_schema, props)
        flink_kafka_producer.set_write_timestamp_to_kafka(False)

        j_producer_config = get_private_field(flink_kafka_producer.get_java_function(),
                                              'producerConfig')
        self.assertEqual('localhost:9092', j_producer_config.getProperty('bootstrap.servers'))
        self.assertEqual('test_group', j_producer_config.getProperty('group.id'))
        self.assertFalse(get_private_field(flink_kafka_producer.get_java_function(),
                                           'writeTimestampToKafka'))
Example #4
    def test_tuple_type(self):
        self.assertEqual(TupleTypeInfo([Types.STRING(),
                                        Types.INT()]),
                         TupleTypeInfo([Types.STRING(),
                                        Types.INT()]), True)

        self.assertEqual(
            TupleTypeInfo([Types.STRING(), Types.INT()]).__str__(),
            "TupleTypeInfo(String, Integer)")

        self.assertNotEqual(TupleTypeInfo([Types.STRING(),
                                           Types.INT()]),
                            TupleTypeInfo([Types.STRING(),
                                           Types.BOOLEAN()]))

        self.assertEqual(Types.TUPLE([Types.STRING(),
                                      Types.INT()]),
                         TupleTypeInfo([Types.STRING(),
                                        Types.INT()]))

        self.assertEqual(
            Types.TUPLE([Types.STRING(), Types.INT()]).get_field_types(),
            [Types.STRING(), Types.INT()])
Example #5
    def _align_output_type(self) -> 'DataStream':
        """
        Transform the pickled python object into String if the output type is PickledByteArrayInfo.
        """
        output_type_info_class = self._j_data_stream.getTransformation().getOutputType().getClass()
        if output_type_info_class.isAssignableFrom(
                PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO().get_java_type_info()
                .getClass()):
            def python_obj_to_str_map_func(value):
                if not isinstance(value, (str, bytes)):
                    value = str(value)
                return value

            transformed_data_stream = DataStream(self.map(python_obj_to_str_map_func,
                                                          type_info=Types.STRING())._j_data_stream)
            return transformed_data_stream
        else:
            return self
Example #6
 def test_map_function_without_data_types(self):
     self.env.set_parallelism(1)
     ds = self.env.from_collection([('ab', decimal.Decimal(1)),
                                    ('bdc', decimal.Decimal(2)),
                                    ('cfgs', decimal.Decimal(3)),
                                    ('deeefg', decimal.Decimal(4))],
                                   type_info=Types.ROW(
                                       [Types.STRING(),
                                        Types.BIG_DEC()]))
     ds.map(MyMapFunction()).add_sink(self.test_sink)
     self.env.execute('map_function_test')
     results = self.test_sink.get_results(True)
     expected = [
         "('ab', 2, Decimal('1'))", "('bdc', 3, Decimal('2'))",
         "('cfgs', 4, Decimal('3'))", "('deeefg', 6, Decimal('4'))"
     ]
     expected.sort()
     results.sort()
     self.assertEqual(expected, results)
Example #7
    def test_csv_row_serialization_schema(self):
        JRow = get_gateway().jvm.org.apache.flink.types.Row

        j_row = JRow(3)
        j_row.setField(0, "BEGIN")
        j_row.setField(2, "END")

        def field_assertion(field_info, csv_value, value, field_delimiter):
            row_info = Types.ROW([Types.STRING(), field_info, Types.STRING()])
            expected_csv = "BEGIN" + field_delimiter + csv_value + field_delimiter + "END\n"
            j_row.setField(1, value)

            csv_row_serialization_schema = CsvRowSerializationSchema.Builder(row_info)\
                .set_escape_character('*').set_quote_character('\'')\
                .set_array_element_delimiter(':').set_field_delimiter(';').build()
            csv_row_deserialization_schema = CsvRowDeserializationSchema.Builder(row_info)\
                .set_escape_character('*').set_quote_character('\'')\
                .set_array_element_delimiter(':').set_field_delimiter(';').build()

            serialized_bytes = csv_row_serialization_schema._j_serialization_schema.serialize(
                j_row)
            self.assertEqual(expected_csv,
                             str(serialized_bytes, encoding='utf-8'))

            j_deserialized_row = csv_row_deserialization_schema._j_deserialization_schema\
                .deserialize(expected_csv.encode("utf-8"))
            self.assertTrue(j_row.equals(j_deserialized_row))

        field_assertion(Types.STRING(), "'123''4**'", "123'4*", ";")
        field_assertion(Types.STRING(), "'a;b''c'", "a;b'c", ";")
        field_assertion(Types.INT(), "12", 12, ";")

        test_j_row = JRow(2)
        test_j_row.setField(0, "1")
        test_j_row.setField(1, "hello")

        field_assertion(Types.ROW([Types.STRING(),
                                   Types.STRING()]), "'1:hello'", test_j_row,
                        ";")
        test_j_row.setField(1, "hello world")
        field_assertion(Types.ROW([Types.STRING(),
                                   Types.STRING()]), "'1:hello world'",
                        test_j_row, ";")
        field_assertion(Types.STRING(), "null", "null", ";")
Example #8
    def test_event_time_tumbling_window_all(self):
        data_stream = self.env.from_collection(
            [('hi', 1), ('hello', 2), ('hi', 3), ('hello', 4), ('hello', 5),
             ('hi', 8), ('hi', 9), ('hi', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream

        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .window_all(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
            .process(CountAllWindowProcessFunction(),
                     Types.TUPLE([Types.LONG(), Types.LONG(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_event_time_tumbling_window_all')
        results = self.test_sink.get_results()
        expected = ['(0,5,4)', '(15,20,1)', '(5,10,3)']
        self.assert_equals_sorted(expected, results)
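
The helpers referenced above (SecondColumnTimestampAssigner, CountAllWindowProcessFunction) are
defined elsewhere in the original test module. A hedged sketch of what the window process function
might look like, assuming it emits (window start, window end, element count) as the expected
results suggest:

from pyflink.datastream.functions import ProcessAllWindowFunction


class CountAllWindowProcessFunction(ProcessAllWindowFunction):

    def process(self, context, elements):
        # context.window() exposes the TimeWindow; emit its bounds plus the element count
        window = context.window()
        yield window.start, window.end, len([e for e in elements])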
Example #9
    def test_set_topic(self):
        input_type = Types.ROW([Types.STRING()])

        serialization_schema = KafkaRecordSerializationSchema.builder() \
            .set_topic('test-topic') \
            .set_value_serialization_schema(
                JsonRowSerializationSchema.builder().with_type_info(input_type).build()) \
            .build()
        jvm = get_gateway().jvm
        serialization_schema._j_serialization_schema.open(
            jvm.org.apache.flink.connector.testutils.formats.
            DummyInitializationContext(),
            jvm.org.apache.flink.connector.kafka.sink.DefaultKafkaSinkContext(
                0, 1, jvm.java.util.Properties()))

        j_record = serialization_schema._j_serialization_schema.serialize(
            to_java_data_structure(Row('test')), None, None)
        self.assertEqual(j_record.topic(), 'test-topic')
        self.assertIsNone(j_record.key())
        self.assertEqual(j_record.value(), b'{"f0":"test"}')
Example #10
    def test_add_jars(self):
        # find kafka connector jars
        flink_source_root = _find_flink_source_root()
        jars_abs_path = flink_source_root + '/flink-connectors/flink-sql-connector-kafka'
        specific_jars = glob.glob(jars_abs_path + '/target/flink*.jar')
        specific_jars = ['file://' + specific_jar for specific_jar in specific_jars]

        self.env.add_jars(*specific_jars)
        source_topic = 'test_source_topic'
        props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
        type_info = Types.ROW([Types.INT(), Types.STRING()])

        # Test for kafka consumer
        deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(type_info=type_info).build()

        # A ClassNotFoundException will be raised if the kafka connector is not added to the pipeline jars.
        kafka_consumer = FlinkKafkaConsumer(source_topic, deserialization_schema, props)
        self.env.add_source(kafka_consumer).print()
        self.env.get_execution_plan()
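
Outside the test harness the same mechanism is usually a single call; a hedged sketch with a
placeholder jar URL:

from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
# the URL below is a placeholder; point it at the actual connector jar on the client machine
env.add_jars("file:///path/to/flink-sql-connector-kafka.jar")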
Example #11
    def test_kinesis_firehose_sink(self):
        _load_specific_flink_module_jars(
            '/flink-connectors/'
            'flink-sql-connector-aws-kinesis-firehose')

        sink_properties = {
            'aws.region': 'eu-west-1',
            'aws.credentials.provider.basic.accesskeyid': 'aws_access_key_id',
            'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
        }

        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))

        kinesis_firehose_sink = KinesisFirehoseSink.builder() \
            .set_firehose_client_properties(sink_properties) \
            .set_serialization_schema(SimpleStringSchema()) \
            .set_delivery_stream_name('stream-1') \
            .set_fail_on_error(False) \
            .set_max_batch_size(500) \
            .set_max_in_flight_requests(50) \
            .set_max_buffered_requests(10000) \
            .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
            .set_max_time_in_buffer_ms(5000) \
            .set_max_record_size_in_bytes(1 * 1024 * 1024) \
            .build()

        ds.sink_to(kinesis_firehose_sink).name('kinesis firehose sink')
        plan = eval(self.env.get_execution_plan())

        self.assertEqual('kinesis firehose sink: Writer',
                         plan['nodes'][1]['type'])
        self.assertEqual(
            get_field_value(kinesis_firehose_sink.get_java_function(),
                            'failOnError'), False)
        self.assertEqual(
            get_field_value(kinesis_firehose_sink.get_java_function(),
                            'deliveryStreamName'), 'stream-1')
Example #12
 def test_source_deprecated_method(self):
     test_option = ConfigOptions.key('pulsar.source.enableAutoAcknowledgeMessage') \
         .boolean_type().no_default_value()
     pulsar_source = PulsarSource.builder() \
         .set_service_url('pulsar://localhost:6650') \
         .set_admin_url('http://localhost:8080') \
         .set_topics('ada') \
         .set_deserialization_schema(
             PulsarDeserializationSchema.flink_type_info(Types.STRING(), None)) \
         .set_subscription_name('ff') \
         .set_config(test_option, True) \
         .set_config_with_dict({'pulsar.source.autoCommitCursorInterval': '1000'}) \
         .build()
     configuration = get_field_value(pulsar_source.get_java_function(),
                                     "sourceConfiguration")
     self.assertEqual(
         configuration.getBoolean(test_option._j_config_option), True)
     self.assertEqual(
         configuration.getLong(
             ConfigOptions.key('pulsar.source.autoCommitCursorInterval').
             long_type().no_default_value()._j_config_option), 1000)
Example #13
 def test_map_function_without_data_types(self):
     self.env.set_parallelism(1)
     ds = self.env.from_collection([('ab', decimal.Decimal(1)),
                                    ('bdc', decimal.Decimal(2)),
                                    ('cfgs', decimal.Decimal(3)),
                                    ('deeefg', decimal.Decimal(4))],
                                   type_info=Types.ROW(
                                       [Types.STRING(),
                                        Types.BIG_DEC()]))
     mapped_stream = ds.map(MyMapFunction())
     collect_util = DataStreamCollectUtil()
     collect_util.collect(mapped_stream)
     self.env.execute('map_function_test')
     results = collect_util.results()
     expected = [
         "('ab', 2, Decimal('1'))", "('bdc', 3, Decimal('2'))",
         "('cfgs', 4, Decimal('3'))", "('deeefg', 6, Decimal('4'))"
     ]
     expected.sort()
     results.sort()
     self.assertEqual(expected, results)
Example #14
 def test_to_retract_stream(self):
     self.env.set_parallelism(1)
     t_env = StreamTableEnvironment.create(
         self.env,
         environment_settings=EnvironmentSettings.in_streaming_mode())
     table = t_env.from_elements([(1, "Hi", "Hello"), (1, "Hi", "Hello")],
                                 ["a", "b", "c"])
     new_table = table.group_by("c").select("a.sum, c as b")
     ds = t_env.to_retract_stream(table=new_table,
                                  type_info=Types.ROW(
                                      [Types.LONG(),
                                       Types.STRING()]))
     test_sink = DataStreamTestSinkFunction()
     ds.map(lambda x: x).add_sink(test_sink)
     self.env.execute("test_to_retract_stream")
     result = test_sink.get_results(True)
     expected = [
         "(True, Row(f0=1, f1='Hello'))", "(False, Row(f0=1, f1='Hello'))",
         "(True, Row(f0=2, f1='Hello'))"
     ]
     self.assertEqual(result, expected)
Example #15
    def test_rabbitmq_connectors(self):
        connection_config = RMQConnectionConfig.Builder() \
            .set_host('localhost') \
            .set_port(5672) \
            .set_virtual_host('/') \
            .set_user_name('guest') \
            .set_password('guest') \
            .build()
        type_info = Types.ROW([Types.INT(), Types.STRING()])
        deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(type_info=type_info).build()

        rmq_source = RMQSource(
            connection_config, 'source_queue', True, deserialization_schema)
        self.assertEqual(
            get_field_value(rmq_source.get_java_function(), 'queueName'), 'source_queue')
        self.assertTrue(get_field_value(rmq_source.get_java_function(), 'usesCorrelationId'))

        serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
            .build()
        rmq_sink = RMQSink(connection_config, 'sink_queue', serialization_schema)
        self.assertEqual(
            get_field_value(rmq_sink.get_java_function(), 'queueName'), 'sink_queue')
Example #16
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD(
            'string_array',
            DataTypes.ARRAY(DataTypes.STRING()).bridged_to('java.util.ArrayList')
        ),
        DataTypes.FIELD(
            'int_array',
            DataTypes.ARRAY(DataTypes.INT()).bridged_to('java.util.ArrayList')
        ),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, data
Example #17
    def _check_serialization_schema_implementations(check_function):
        input_type = Types.ROW([Types.STRING()])

        check_function(
            JsonRowSerializationSchema.builder().with_type_info(
                input_type).build(),
            'org.apache.flink.formats.json.JsonRowSerializationSchema')
        check_function(
            CsvRowSerializationSchema.Builder(input_type).build(),
            'org.apache.flink.formats.csv.CsvRowSerializationSchema')
        avro_schema_string = """
        {
            "type": "record",
            "name": "test_record",
            "fields": []
        }
        """
        check_function(
            AvroRowSerializationSchema(avro_schema_string=avro_schema_string),
            'org.apache.flink.formats.avro.AvroRowSerializationSchema')
        check_function(
            SimpleStringSchema(),
            'org.apache.flink.api.common.serialization.SimpleStringSchema')
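
check_function is supplied by the caller; a hedged sketch of one possible implementation, which
compares the fully qualified class name of the wrapped Java serialization schema (the
_j_serialization_schema attribute is the same one used in the other examples here):

def check_function(serialization_schema, expected_class_name):
    # compare the canonical Java class name of the underlying serialization schema
    j_schema = serialization_schema._j_serialization_schema
    assert j_schema.getClass().getCanonicalName() == expected_class_name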
Example #18
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to one so that all data, including fired timers and normal records, is
    # processed by the same worker and the collected results stay in order, which simplifies the
    # assertions.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    type_info = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(),
         Types.LONG(),
         Types.DOUBLE(),
         Types.INT(),
         Types.INT()])
    json_row_schema = JsonRowDeserializationSchema.builder().type_info(
        type_info).build()
    kafka_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-source'
    }

    kafka_consumer = FlinkKafkaConsumer("timer-stream-source", json_row_schema,
                                        kafka_props)
    kafka_producer = FlinkKafkaProducer("timer-stream-sink",
                                        SimpleStringSchema(), kafka_props)

    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5))\
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    kafka_consumer.set_start_from_earliest()
    ds = env.add_source(kafka_consumer).assign_timestamps_and_watermarks(
        watermark_strategy)
    ds.key_by(MyKeySelector(), key_type=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
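
MyKeySelector, MyProcessFunction and KafkaRowTimestampAssigner are defined elsewhere in the
original file. A hedged sketch of what a timestamp assigner for this row type could look like
('createTime' is the first column of the row defined above):

from pyflink.common.watermark_strategy import TimestampAssigner


class KafkaRowTimestampAssigner(TimestampAssigner):

    def extract_timestamp(self, value, record_timestamp) -> int:
        # use the 'createTime' column (first field of the row) as the event timestamp
        return int(value[0])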
Example #19
    def test_kinesis_streams_sink(self):
        sink_properties = {
            'aws.region': 'us-east-1',
            'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
        }

        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))

        kinesis_streams_sink = KinesisStreamsSink.builder() \
            .set_kinesis_client_properties(sink_properties) \
            .set_serialization_schema(SimpleStringSchema()) \
            .set_partition_key_generator(PartitionKeyGenerator.fixed()) \
            .set_stream_name("stream-1") \
            .set_fail_on_error(False) \
            .set_max_batch_size(500) \
            .set_max_in_flight_requests(50) \
            .set_max_buffered_requests(10000) \
            .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
            .set_max_time_in_buffer_ms(5000) \
            .set_max_record_size_in_bytes(1 * 1024 * 1024) \
            .build()

        ds.sink_to(kinesis_streams_sink).name('kinesis streams sink')
        plan = eval(self.env.get_execution_plan())

        self.assertEqual('kinesis streams sink: Writer',
                         plan['nodes'][1]['type'])
        self.assertEqual(
            get_field_value(kinesis_streams_sink.get_java_function(),
                            'failOnError'), False)
        self.assertEqual(
            get_field_value(kinesis_streams_sink.get_java_function(),
                            'streamName'), 'stream-1')
Example #20
    def test_from_java_type(self):
        basic_int_type_info = Types.INT()
        self.assertEqual(basic_int_type_info,
                         _from_java_type(basic_int_type_info.get_java_type_info()))

        basic_short_type_info = Types.SHORT()
        self.assertEqual(basic_short_type_info,
                         _from_java_type(basic_short_type_info.get_java_type_info()))

        basic_long_type_info = Types.LONG()
        self.assertEqual(basic_long_type_info,
                         _from_java_type(basic_long_type_info.get_java_type_info()))

        basic_float_type_info = Types.FLOAT()
        self.assertEqual(basic_float_type_info,
                         _from_java_type(basic_float_type_info.get_java_type_info()))

        basic_double_type_info = Types.DOUBLE()
        self.assertEqual(basic_double_type_info,
                         _from_java_type(basic_double_type_info.get_java_type_info()))

        basic_char_type_info = Types.CHAR()
        self.assertEqual(basic_char_type_info,
                         _from_java_type(basic_char_type_info.get_java_type_info()))

        basic_byte_type_info = Types.BYTE()
        self.assertEqual(basic_byte_type_info,
                         _from_java_type(basic_byte_type_info.get_java_type_info()))

        basic_big_int_type_info = Types.BIG_INT()
        self.assertEqual(basic_big_int_type_info,
                         _from_java_type(basic_big_int_type_info.get_java_type_info()))

        basic_big_dec_type_info = Types.BIG_DEC()
        self.assertEqual(basic_big_dec_type_info,
                         _from_java_type(basic_big_dec_type_info.get_java_type_info()))

        basic_sql_date_type_info = Types.SQL_DATE()
        self.assertEqual(basic_sql_date_type_info,
                         _from_java_type(basic_sql_date_type_info.get_java_type_info()))

        basic_sql_time_type_info = Types.SQL_TIME()
        self.assertEqual(basic_sql_time_type_info,
                         _from_java_type(basic_sql_time_type_info.get_java_type_info()))

        basic_sql_timestamp_type_info = Types.SQL_TIMESTAMP()
        self.assertEqual(basic_sql_timestamp_type_info,
                         _from_java_type(basic_sql_timestamp_type_info.get_java_type_info()))

        row_type_info = Types.ROW([Types.INT(), Types.STRING()])
        self.assertEqual(row_type_info, _from_java_type(row_type_info.get_java_type_info()))

        tuple_type_info = Types.TUPLE([Types.CHAR(), Types.INT()])
        self.assertEqual(tuple_type_info, _from_java_type(tuple_type_info.get_java_type_info()))

        primitive_int_array_type_info = Types.PRIMITIVE_ARRAY(Types.INT())
        self.assertEqual(primitive_int_array_type_info,
                         _from_java_type(primitive_int_array_type_info.get_java_type_info()))

        object_array_type_info = Types.OBJECT_ARRAY(Types.SQL_DATE())
        self.assertEqual(object_array_type_info,
                         _from_java_type(object_array_type_info.get_java_type_info()))

        pickled_byte_array_type_info = Types.PICKLED_BYTE_ARRAY()
        self.assertEqual(pickled_byte_array_type_info,
                         _from_java_type(pickled_byte_array_type_info.get_java_type_info()))

        sql_date_type_info = Types.SQL_DATE()
        self.assertEqual(sql_date_type_info,
                         _from_java_type(sql_date_type_info.get_java_type_info()))

        map_type_info = Types.MAP(Types.INT(), Types.STRING())
        self.assertEqual(map_type_info,
                         _from_java_type(map_type_info.get_java_type_info()))

        list_type_info = Types.LIST(Types.INT())
        self.assertEqual(list_type_info,
                         _from_java_type(list_type_info.get_java_type_info()))
Example #21
    def test_row_type(self):
        self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()])
                         .get_field_names(), ['f0', 'f1'])
        self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()],
                                     ['a', 'b']).get_field_names(), ['a', 'b'])

        self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()],
                                     ['a', 'b']) == RowTypeInfo([Types.STRING(),
                                                                 Types.STRING()], ['a', 'b']), True)
        self.assertEqual(RowTypeInfo([Types.STRING(),
                                      Types.STRING()],
                                     ['a', 'b']) == RowTypeInfo([Types.STRING(),
                                                                Types.INT()],
                                                                ['a', 'b']), False)
        self.assertEqual(RowTypeInfo([Types.STRING(),
                                      Types.STRING()],
                                     ['a', 'b']).__str__(), "RowTypeInfo(a: String, b: String)")

        self.assertEqual(Types.ROW([Types.STRING(), Types.STRING()]),
                         RowTypeInfo([Types.STRING(), Types.STRING()]), True)

        self.assertEqual(Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()])
                         .get_field_names(), ['a', 'b'], True)

        self.assertEqual(Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()])
                         .get_field_types(), [Types.STRING(), Types.STRING()], True)
Example #22
def ds_operators():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_python_executable(
        r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(),
                             Types.STRING(),
                             Types.STRING()]))
    """
    map
    flat_map
    filter
    key_by DataStream → KeyedStream
    reduce KeyedStream → DataStream
    union DataStream* → DataStream
    connect DataStream,DataStream → ConnectedStreams
    转换元组:
    project
    分区:
    partition_custom 自定义分区
    shuffle 随机分区 根据均匀分布随机划分元素。
    rebalance 轮询分区
    rescale 重新分区
    broadcast 向每个分区广播元素
    随意定制
    process 只有在KeyedStream上应用ProcessFunction时,才可以访问键控状态和计时器TimerService(相当于java的windows)。
    其它
    start_new_chain
    disable_chaining
    slot_sharing_group
    """
    ds.rescale()
    ds.map(lambda x: x)           # placeholder mapper; map() requires a function argument
    ds.flat_map(lambda x: [x])    # placeholder flat-map function
    ds.filter(lambda x: True)     # placeholder predicate
    # KeyBy DataStream → KeyedStream
    # Reduce KeyedStream → DataStream
    ds = s_env.from_collection([(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
                               type_info=Types.ROW(
                                   [Types.INT(), Types.STRING()]))
    ds.key_by(lambda a: a[1]) \
        .reduce(lambda a, b: Row(a[0] + b[0], b[1]))
    # broadcast
    ds.broadcast()
    # project is only available on tuple data streams
    ds = s_env.from_collection([[1, 2, 3, 4], [5, 6, 7, 8]],
                               type_info=Types.TUPLE([
                                   Types.INT(),
                                   Types.INT(),
                                   Types.INT(),
                                   Types.INT()
                               ]))
    # emit the tuple fields at indexes 1 and 3
    ds.project(1, 3).map(lambda x: (x[0], x[1] + 1)).print()  # print() serves as a simple sink here

    # write the stream to files
    ds.add_sink(
        StreamingFileSink.for_row_format(
            '/tmp/output', SimpleStringEncoder()).with_rolling_policy(
                DefaultRollingPolicy.builder().with_rollover_interval(
                    15 * 60 * 1000).with_inactivity_interval(
                        5 * 60 * 1000).with_max_part_size(1024 * 1024 *
                                                          1024).build()).
        with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder().with_part_prefix(
                "prefix").with_part_suffix("suffix").build()).build())
    s_env.execute('ds_operators')
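
As noted in the operator overview above, keyed state and timers only become available when a
process function runs on a KeyedStream. A minimal, hedged sketch (not part of the original demo;
class and job names are illustrative):

from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.functions import KeyedProcessFunction, RuntimeContext
from pyflink.datastream.state import ValueStateDescriptor


class CountPerKey(KeyedProcessFunction):

    def open(self, runtime_context: RuntimeContext):
        # keyed ValueState: one counter per key
        self.count_state = runtime_context.get_state(
            ValueStateDescriptor('count', Types.LONG()))

    def process_element(self, value, ctx):
        current = self.count_state.value() or 0
        self.count_state.update(current + 1)
        yield value[0], current + 1


env = StreamExecutionEnvironment.get_execution_environment()
env.from_collection([('a', 1), ('a', 2), ('b', 3)],
                    type_info=Types.TUPLE([Types.STRING(), Types.INT()])) \
    .key_by(lambda v: v[0]) \
    .process(CountPerKey(),
             output_type=Types.TUPLE([Types.STRING(), Types.LONG()])) \
    .print()
env.execute('keyed_process_sketch')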
Example #23
    def test_from_and_to_data_stream_event_time(self):
        from pyflink.table import Schema

        ds = self.env.from_collection(
            [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
            Types.ROW_NAMED(
                ["a", "b", "c"],
                [Types.LONG(), Types.INT(),
                 Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps(
            ).with_timestamp_assigner(MyTimestampAssigner()))

        table = self.t_env.from_data_stream(
            ds,
            Schema.new_builder().column_by_metadata(
                "rowtime",
                "TIMESTAMP_LTZ(3)").watermark("rowtime",
                                              "SOURCE_WATERMARK()").build())
        self.assertEqual(
            """(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
            table._j_table.getResolvedSchema().toString())
        self.t_env.create_temporary_view(
            "t", ds,
            Schema.new_builder().column_by_metadata(
                "rowtime",
                "TIMESTAMP_LTZ(3)").watermark("rowtime",
                                              "SOURCE_WATERMARK()").build())

        result = self.t_env.execute_sql(
            "SELECT "
            "c, SUM(b) "
            "FROM t "
            "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [
                item for item in map(
                    str, [Row('a', 47),
                          Row('c', 1000),
                          Row('c', 1000)])
            ]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)

        ds = self.t_env.to_data_stream(table)
        ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
        actual_results = self.test_sink.get_results(False)
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)
Example #24
    def test_generate_stream_graph_with_dependencies(self):
        python_file_dir = os.path.join(self.tempdir,
                                       "python_file_dir_" + str(uuid.uuid4()))
        os.mkdir(python_file_dir)
        python_file_path = os.path.join(
            python_file_dir, "test_stream_dependency_manage_lib.py")
        with open(python_file_path, 'w') as f:
            f.write("def add_two(a):\n    return a + 2")
        env = self.env
        env.add_python_file(python_file_path)

        def plus_two_map(value):
            from test_stream_dependency_manage_lib import add_two
            return value[0], add_two(value[1])

        def add_from_file(i):
            with open("data/data.txt", 'r') as f:
                return i[0], i[1] + int(f.read())

        from_collection_source = env.from_collection(
            [('a', 0), ('b', 0), ('c', 1), ('d', 1), ('e', 2)],
            type_info=Types.ROW([Types.STRING(), Types.INT()]))
        from_collection_source.name("From Collection")
        keyed_stream = from_collection_source.key_by(lambda x: x[1],
                                                     key_type=Types.INT())

        plus_two_map_stream = keyed_stream.map(plus_two_map).name(
            "Plus Two Map").set_parallelism(3)

        add_from_file_map = plus_two_map_stream.map(add_from_file).name(
            "Add From File Map")

        test_stream_sink = add_from_file_map.add_sink(
            self.test_sink).name("Test Sink")
        test_stream_sink.set_parallelism(4)

        archive_dir_path = os.path.join(self.tempdir,
                                        "archive_" + str(uuid.uuid4()))
        os.mkdir(archive_dir_path)
        with open(os.path.join(archive_dir_path, "data.txt"), 'w') as f:
            f.write("3")
        archive_file_path = \
            shutil.make_archive(os.path.dirname(archive_dir_path), 'zip', archive_dir_path)
        env.add_python_archive(archive_file_path, "data")

        nodes = eval(env.get_execution_plan())['nodes']

        # The StreamGraph should be as below:
        # Source: From Collection -> _stream_key_by_map_operator ->
        # Plus Two Map -> Add From File Map -> Sink: Test Sink.

        # Source: From Collection and _stream_key_by_map_operator should have the same parallelism.
        self.assertEqual(nodes[0]['parallelism'], nodes[1]['parallelism'])

        # The parallelism of Plus Two Map should be 3
        self.assertEqual(nodes[2]['parallelism'], 3)

        # The ship_strategy for Source: From Collection and _stream_key_by_map_operator should be
        # FORWARD
        self.assertEqual(nodes[1]['predecessors'][0]['ship_strategy'],
                         "FORWARD")

        # The ship_strategy for _keyed_stream_values_operator and Plus Two Map should be
        # HASH
        self.assertEqual(nodes[2]['predecessors'][0]['ship_strategy'], "HASH")

        # The parallelism of Sink: Test Sink should be 4
        self.assertEqual(nodes[4]['parallelism'], 4)

        python_dependency_config = dict(
            get_gateway().jvm.org.apache.flink.python.util.
            PythonDependencyUtils.configurePythonDependencies(
                env._j_stream_execution_environment.getCachedFiles(),
                env._j_stream_execution_environment.getConfiguration()).toMap(
                ))

        # Make sure that user specified files and archives are correctly added.
        self.assertIsNotNone(
            python_dependency_config['python.internal.files-key-map'])
        self.assertIsNotNone(
            python_dependency_config['python.internal.archives-key-map'])
Example #25
    def test_from_collection_with_data_types(self):
        # verify from_collection for the collection with single object.
        ds = self.env.from_collection(['Hi', 'Hello'],
                                      type_info=Types.STRING())
        ds.add_sink(self.test_sink)
        self.env.execute("test from collection with single object")
        results = self.test_sink.get_results(False)
        expected = ['Hello', 'Hi']
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)

        # verify from_collection for the collection with multiple objects like tuple.
        ds = self.env.from_collection(
            [(1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
              bytearray(b'flink'), 'pyflink', datetime.date(2014, 9, 13),
              datetime.time(hour=12, minute=0, second=0, microsecond=123000),
              datetime.datetime(2018, 3, 11, 3, 0, 0, 123000), [1, 2, 3],
              decimal.Decimal('1000000000000000000.05'),
              decimal.Decimal('1000000000000000000.0599999999999'
                              '9999899999999999')),
             (2, None, 2, True, 43878, 9147483648, 9.87, 2.98936,
              bytearray(b'flink'), 'pyflink', datetime.date(2015, 10, 14),
              datetime.time(hour=11, minute=2, second=2, microsecond=234500),
              datetime.datetime(2020, 4, 15, 8, 2, 6, 235000), [2, 4, 6],
              decimal.Decimal('2000000000000000000.74'),
              decimal.Decimal('2000000000000000000.061111111111111'
                              '11111111111111'))],
            type_info=Types.ROW([
                Types.LONG(),
                Types.LONG(),
                Types.SHORT(),
                Types.BOOLEAN(),
                Types.SHORT(),
                Types.INT(),
                Types.FLOAT(),
                Types.DOUBLE(),
                Types.PICKLED_BYTE_ARRAY(),
                Types.STRING(),
                Types.SQL_DATE(),
                Types.SQL_TIME(),
                Types.SQL_TIMESTAMP(),
                Types.BASIC_ARRAY(Types.LONG()),
                Types.BIG_DEC(),
                Types.BIG_DEC()
            ]))
        ds.add_sink(self.test_sink)
        self.env.execute("test from collection with tuple object")
        results = self.test_sink.get_results(False)
        # If the user specifies the data types of the input data, the collected results are in Row format.
        expected = [
            '+I[1, null, 1, true, 32767, -2147483648, 1.23, 1.98932, [102, 108, 105, 110, 107], '
            'pyflink, 2014-09-13, 12:00:00, 2018-03-11 03:00:00.123, [1, 2, 3], '
            '1000000000000000000.05, 1000000000000000000.05999999999999999899999999999]',
            '+I[2, null, 2, true, -21658, 557549056, 9.87, 2.98936, [102, 108, 105, 110, 107], '
            'pyflink, 2015-10-14, 11:02:02, 2020-04-15 08:02:06.235, [2, 4, 6], '
            '2000000000000000000.74, 2000000000000000000.06111111111111111111111111111]'
        ]
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)
Example #26
def demo01():
    # Create an execution environment that represents the context in which the program is
    # currently executed. If the program is invoked standalone, this returns a local execution
    # environment.
    # 1: create a stream execution environment; when started locally it is a local environment,
    #    when started on a cluster it is a cluster environment.
    env = StreamExecutionEnvironment.get_execution_environment()

    # Add a list of URLs that are appended to the classpath of every user-code class loader of the
    # program. The paths must specify a protocol (e.g. file://) and be accessible on all nodes.
    env.add_classpaths("file://lib")

    # Add a list of jar files that will be uploaded to the cluster and referenced by the job,
    # e.g. .set_string("pipeline.jars", 'file://' + dir_kafka_sql_connect)
    env.add_jars("file://jars")

    # Add a Python archive file. The file will be extracted into the working directory of the
    # Python UDF workers.
    # Currently only the zip format is supported, e.g. zip, jar, whl, egg, etc.
    # The archive is created beforehand, e.g.: zip -r py_env.zip py_env.zip
    env.add_python_archive("py_env.zip")
    # If the Python UDFs depend on a specific Python version that is not present on the cluster,
    # this method can be used to upload a virtual environment. Note that the path of the Python
    # interpreter contained in the uploaded environment should be specified via set_python_executable.
    env.set_python_executable("py_env.zip/py_env/bin/python")
    # conf/flink-conf.yaml: add python.client.executable: /usr/bin/python3
    # or
    env.add_python_archive("py_env.zip", "myenv")
    env.set_python_executable("myenv/py_env/bin/python")
    # the files contained in the archive file can be accessed in UDF
    """
    def my_udf():
        with open("myenv/py_env/data/data.txt") as f:
            ...
    """
    # Equivalent to: pip download -d cached_dir -r requirements.txt --no-binary :all:
    env.set_python_requirements("requirements.txt", "cached_dir")
    # Add a Python dependency, which can be a Python file, a Python package or a local directory.
    # It will be added to the PYTHONPATH of the Python UDF workers. Make sure the dependency can
    # be imported.
    env.add_python_file("")

    # add sources
    # 1. add_source
    ds = env.add_source(
        FlinkKafkaConsumer(
            "source_topic",
            JsonRowDeserializationSchema.builder().type_info(
                type_info=Types.ROW([Types.INT(), Types.STRING()])).build(), {
                    'bootstrap.servers': 'localhost:9092',
                    'group.id': 'test_group'
                }))
    # 2. from_collection
    ds = env.from_collection([
        1,
        2,
        3,
    ], Types.INT())
    # 3. from a file
    ds = env.read_text_file("hdfs://host:port/file/path")

    # disable operator chaining
    env.disable_operator_chaining()
    """
    Flink 可以非常高效的进行有状态流的计算,通过使用 Flink 内置的 Keyed State 和 Operator State,保存每个算子的状态。
    默认情况下,状态是存储在 JVM 的堆内存中,如果系统中某个环节发生了错误,宕机,这个时候所有的状态都会丢失,并且无法恢复,会导致整个系统的数据计算发生错误。
    此时就需要 Checkpoint 来保障系统的容错。Checkpoint 过程,就是把算子的状态周期性持久化的过程。
    在系统出错后恢复时,就可以从 checkpoint 中恢复每个算子的状态,从上次消费的地方重新开始消费和计算。从而可以做到在高效进行计算的同时还可以保证数据不丢失,只计算一次。
    
    最少一次
    AT_LEAST_ONCE
    如果假定是传输过程出现问题,而服务器没有收到数据,这样time out之后重传数据。但这可能是返回成功消息的时候出问题,而此时服务器已经收到数据,这样会因为重传而收到多份数据,这就是 at least once
    
    严格一次
    EXACTLY_ONCE
    
    最多一次(At-most-once)、最少一次(At-least-once),以及严格一次(Exactly-once)
    
    Checkpoint 必要的两个条件
    1. 需要支持重放一定时间范围内数据的数据源,比如:kafka 。
    因为容错机制就是在任务失败后自动从最近一次成功的 checkpoint 处恢复任务,此时需要把任务失败前消费的数据再消费一遍。
    假设数据源不支持重放,那么数据还未写到存储中就丢了,任务恢复后,就再也无法重新消费这部分丢了的数据了。
    
    2. 需要一个存储来保存持久化的状态,如:Hdfs,本地文件。可以在任务失败后,从存储中恢复 checkpoint 数据。
    
    https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/stream/state/checkpointing.html
    https://ci.apache.org/projects/flink/flink-docs-release-1.12/api/python/pyflink.datastream.html#pyflink.datastream.CheckpointConfig
    """
    # take a checkpoint every 300 s
    env.enable_checkpointing(300000, CheckpointingMode.AT_LEAST_ONCE)
    # MemoryStateBackend FsStateBackend CustomStateBackend
    env.set_state_backend(RocksDBStateBackend("file://var/checkpoints/"))

    # set mode to exactly-once (this is the default)
    env.get_checkpoint_config().set_checkpointing_mode(
        CheckpointingMode.EXACTLY_ONCE)

    # Make sure at least 500 ms of progress happens between checkpoints (the default is 0, i.e.
    # the next checkpoint may start immediately).
    env.get_checkpoint_config().set_min_pause_between_checkpoints(500)

    # Checkpoints have to complete within 60 s, or they are discarded.
    env.get_checkpoint_config().set_checkpoint_timeout(60000)

    # Allow only one checkpoint to be in progress at the same time.
    env.get_checkpoint_config().set_max_concurrent_checkpoints(1)

    # Enable externalized checkpoints, which are retained after the job is cancelled.
    env.get_checkpoint_config().enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

    # Prefer checkpoints for recovery even when a more recent savepoint is available.
    env.get_checkpoint_config().set_prefer_checkpoint_for_recovery(True)

    # Enable the experimental unaligned checkpoints to improve performance.
    # They can only be enabled with CheckpointingMode.EXACTLY_ONCE.
    env.get_checkpoint_config().enable_unaligned_checkpoints()
    # env.get_checkpoint_config().disable_unaligned_checkpoints() is equivalent to
    # env.get_checkpoint_config().enable_unaligned_checkpoints(False)

    # equivalent to env.get_checkpoint_config().get_checkpoint_interval()
    env.get_checkpoint_interval()
    """
    """
    # https://ci.apache.org/projects/flink/flink-docs-release-1.12/api/python/pyflink.common.html#pyflink.common.ExecutionConfig
    # bin/flink run -Dexecution.runtime-mode=BATCH examples/streaming/WordCount.jar
    env.get_config().set_execution_mode(ExecutionMode.BATCH)

    env.get_config().disable_auto_generated_uids(
    )  # enable_auto_generated_uids
    # set the uid explicitly
    ds.uid("xx")

    # Set the time characteristic for all streams created from this environment, e.g. processing
    # time, event time or ingestion time.
    # If the characteristic is set to EventTime or IngestionTime, the default watermark update
    # interval is set to 200 ms.
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)  # set the time characteristic
    env.get_config().set_auto_watermark_interval(200)  # emit a watermark every 200 ms

    env.get_config().set_global_job_parameters(
        {"environment.checkpoint_interval": "1000"})

    env.get_config().set_restart_strategy(
        RestartStrategies.fixed_delay_restart(10, 1000))

    # execute the job
    env.execute("job name")
    # execute asynchronously
    jobClient = env.execute_async("job name")
    jobClient.get_job_execution_result().result()
    """
    Sets the maximum time frequency (in milliseconds) for flushing the output buffers. By default
    the output buffers are flushed frequently to provide low latency and a smooth developer
    experience. Setting this parameter yields three logical modes:
    - a positive integer triggers a periodic flush at that interval
    - 0 triggers a flush after every record, minimizing latency (better not to use 0; use a value
      close to 0 instead, such as 5 or 10)
    - -1 triggers a flush only when the output buffer is full, maximizing throughput
    """
    # maximum time frequency (ms) for flushing the output buffers
    env.get_buffer_timeout()
    env.set_buffer_timeout(10)

    # Get the execution plan as JSON and paste it into https://flink.apache.org/visualizer/
    env.get_execution_plan()
Example #27
    def test_pulsar_sink(self):
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))

        TEST_OPTION_NAME = 'pulsar.producer.chunkingEnabled'
        pulsar_sink = PulsarSink.builder() \
            .set_service_url('pulsar://localhost:6650') \
            .set_admin_url('http://localhost:8080') \
            .set_producer_name('fo') \
            .set_topics('ada') \
            .set_serialization_schema(
                PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
            .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
            .set_topic_routing_mode(TopicRoutingMode.ROUND_ROBIN) \
            .delay_sending_message(MessageDelayer.fixed(Duration.of_seconds(12))) \
            .set_config(TEST_OPTION_NAME, True) \
            .set_properties({'pulsar.producer.batchingMaxMessages': '100'}) \
            .build()

        ds.sink_to(pulsar_sink).name('pulsar sink')

        plan = eval(self.env.get_execution_plan())
        self.assertEqual('pulsar sink: Writer', plan['nodes'][1]['type'])
        configuration = get_field_value(pulsar_sink.get_java_function(),
                                        "sinkConfiguration")
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.client.serviceUrl').string_type().
                no_default_value()._j_config_option),
            'pulsar://localhost:6650')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.admin.adminUrl').string_type().
                no_default_value()._j_config_option), 'http://localhost:8080')
        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.producer.producerName').string_type(
                ).no_default_value()._j_config_option), 'fo - %s')

        j_pulsar_serialization_schema = get_field_value(
            pulsar_sink.get_java_function(), 'serializationSchema')
        j_serialization_schema = get_field_value(j_pulsar_serialization_schema,
                                                 'serializationSchema')
        self.assertTrue(
            is_instance_of(
                j_serialization_schema,
                'org.apache.flink.api.common.serialization.SimpleStringSchema')
        )

        self.assertEqual(
            configuration.getString(
                ConfigOptions.key('pulsar.sink.deliveryGuarantee').string_type(
                ).no_default_value()._j_config_option), 'at-least-once')

        j_topic_router = get_field_value(pulsar_sink.get_java_function(),
                                         "topicRouter")
        self.assertTrue(
            is_instance_of(
                j_topic_router,
                'org.apache.flink.connector.pulsar.sink.writer.router.RoundRobinTopicRouter'
            ))

        j_message_delayer = get_field_value(pulsar_sink.get_java_function(),
                                            'messageDelayer')
        delay_duration = get_field_value(j_message_delayer, 'delayDuration')
        self.assertEqual(delay_duration, 12000)

        test_option = ConfigOptions.key(
            TEST_OPTION_NAME).boolean_type().no_default_value()
        self.assertEqual(
            configuration.getBoolean(test_option._j_config_option), True)
        self.assertEqual(
            configuration.getLong(
                ConfigOptions.key('pulsar.producer.batchingMaxMessages').
                long_type().no_default_value()._j_config_option), 100)
Example #28
    def test_es_sink(self):
        ds = self.env.from_collection([{
            'name': 'ada',
            'id': '1'
        }, {
            'name': 'luna',
            'id': '2'
        }],
                                      type_info=Types.MAP(
                                          Types.STRING(), Types.STRING()))

        es_sink = Elasticsearch7SinkBuilder() \
            .set_emitter(ElasticsearchEmitter.static_index('foo', 'id')) \
            .set_hosts(['localhost:9200']) \
            .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
            .set_bulk_flush_max_actions(1) \
            .set_bulk_flush_max_size_mb(2) \
            .set_bulk_flush_interval(1000) \
            .set_bulk_flush_backoff_strategy(FlushBackoffType.CONSTANT, 3, 3000) \
            .set_connection_username('foo') \
            .set_connection_password('bar') \
            .set_connection_path_prefix('foo-bar') \
            .set_connection_request_timeout(30000) \
            .set_connection_timeout(31000) \
            .set_socket_timeout(32000) \
            .build()

        j_emitter = get_field_value(es_sink.get_java_function(), 'emitter')
        self.assertTrue(
            is_instance_of(
                j_emitter,
                'org.apache.flink.connector.elasticsearch.sink.SimpleElasticsearchEmitter'
            ))
        self.assertEqual(
            get_field_value(es_sink.get_java_function(),
                            'hosts')[0].toString(), 'http://localhost:9200')
        self.assertEqual(
            get_field_value(es_sink.get_java_function(),
                            'deliveryGuarantee').toString(), 'at-least-once')

        j_build_bulk_processor_config = get_field_value(
            es_sink.get_java_function(), 'buildBulkProcessorConfig')
        self.assertEqual(
            j_build_bulk_processor_config.getBulkFlushMaxActions(), 1)
        self.assertEqual(j_build_bulk_processor_config.getBulkFlushMaxMb(), 2)
        self.assertEqual(j_build_bulk_processor_config.getBulkFlushInterval(),
                         1000)
        self.assertEqual(
            j_build_bulk_processor_config.getFlushBackoffType().toString(),
            'CONSTANT')
        self.assertEqual(
            j_build_bulk_processor_config.getBulkFlushBackoffRetries(), 3)
        self.assertEqual(
            j_build_bulk_processor_config.getBulkFlushBackOffDelay(), 3000)

        j_network_client_config = get_field_value(es_sink.get_java_function(),
                                                  'networkClientConfig')
        self.assertEqual(j_network_client_config.getUsername(), 'foo')
        self.assertEqual(j_network_client_config.getPassword(), 'bar')
        self.assertEqual(j_network_client_config.getConnectionRequestTimeout(),
                         30000)
        self.assertEqual(j_network_client_config.getConnectionTimeout(), 31000)
        self.assertEqual(j_network_client_config.getSocketTimeout(), 32000)
        self.assertEqual(j_network_client_config.getConnectionPathPrefix(),
                         'foo-bar')

        ds.sink_to(es_sink).name('es sink')
Example #29
import json

from pyflink.common.serialization import SimpleStringSchema, SimpleStringEncoder, JsonRowDeserializationSchema
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, StreamingFileSink
from pyflink.common.typeinfo import Types
from pyflink.datastream.functions import MapFunction

s_env = StreamExecutionEnvironment.get_execution_environment()
s_env.set_parallelism(1)
ti = Types.ROW_NAMED(
    ["app", 'busi', 'date', 'ip'],
    [Types.STRING(),
     Types.STRING(),
     Types.BIG_INT(),
     Types.STRING()])
builder = JsonRowDeserializationSchema.builder()
builder.type_info(ti)
jrds = builder.ignore_parse_errors().build()
fkc = FlinkKafkaConsumer(topics="ULS-BUSI-LOG-dev",
                         deserialization_schema=jrds,
                         properties={
                             "bootstrap.servers": "10.100.1.16:9192",
                             "group.id": "123",
                             "auto.offset.reset": "earliest"
                         })
fkc.set_start_from_earliest()
src = s_env.add_source(fkc).map(lambda x: x.get("values"))
src.add_sink(
    StreamingFileSink.for_row_format('C:\\tmp\\pyoutput',
                                     SimpleStringEncoder()).build())
# submit the job; without this call the pipeline above is only defined, never executed
s_env.execute()
Example #30
def to_java_typeinfo(type_info: TypeInformation):
    if isinstance(type_info, BasicTypeInfo):
        basic_type = type_info._basic_type

        if basic_type == BasicType.STRING:
            j_typeinfo = JTypes.STRING
        elif basic_type == BasicType.BYTE:
            j_typeinfo = JTypes.LONG
        elif basic_type == BasicType.BOOLEAN:
            j_typeinfo = JTypes.BOOLEAN
        elif basic_type == BasicType.SHORT:
            j_typeinfo = JTypes.LONG
        elif basic_type == BasicType.INT:
            j_typeinfo = JTypes.LONG
        elif basic_type == BasicType.LONG:
            j_typeinfo = JTypes.LONG
        elif basic_type == BasicType.FLOAT:
            j_typeinfo = JTypes.DOUBLE
        elif basic_type == BasicType.DOUBLE:
            j_typeinfo = JTypes.DOUBLE
        elif basic_type == BasicType.CHAR:
            j_typeinfo = JTypes.STRING
        elif basic_type == BasicType.BIG_INT:
            j_typeinfo = JTypes.BIG_INT
        elif basic_type == BasicType.BIG_DEC:
            j_typeinfo = JTypes.BIG_DEC
        elif basic_type == BasicType.INSTANT:
            j_typeinfo = JTypes.INSTANT
        else:
            raise TypeError("Invalid BasicType %s." % basic_type)

    elif isinstance(type_info, PrimitiveArrayTypeInfo):
        element_type = type_info._element_type

        if element_type == Types.BOOLEAN():
            j_typeinfo = JPrimitiveArrayTypeInfo.BOOLEAN_PRIMITIVE_ARRAY_TYPE_INFO
        elif element_type == Types.BYTE():
            j_typeinfo = JPrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO
        elif element_type == Types.SHORT():
            j_typeinfo = JPrimitiveArrayTypeInfo.SHORT_PRIMITIVE_ARRAY_TYPE_INFO
        elif element_type == Types.INT():
            j_typeinfo = JPrimitiveArrayTypeInfo.INT_PRIMITIVE_ARRAY_TYPE_INFO
        elif element_type == Types.LONG():
            j_typeinfo = JPrimitiveArrayTypeInfo.LONG_PRIMITIVE_ARRAY_TYPE_INFO
        elif element_type == Types.FLOAT():
            j_typeinfo = JPrimitiveArrayTypeInfo.FLOAT_PRIMITIVE_ARRAY_TYPE_INFO
        elif element_type == Types.DOUBLE():
            j_typeinfo = JPrimitiveArrayTypeInfo.DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO
        elif element_type == Types.CHAR():
            j_typeinfo = JPrimitiveArrayTypeInfo.CHAR_PRIMITIVE_ARRAY_TYPE_INFO
        else:
            raise TypeError("Invalid element type for a primitive array.")

    elif isinstance(type_info, BasicArrayTypeInfo):
        element_type = type_info._element_type

        if element_type == Types.BOOLEAN():
            j_typeinfo = JBasicArrayTypeInfo.BOOLEAN_ARRAY_TYPE_INFO
        elif element_type == Types.BYTE():
            j_typeinfo = JBasicArrayTypeInfo.BYTE_ARRAY_TYPE_INFO
        elif element_type == Types.SHORT():
            j_typeinfo = JBasicArrayTypeInfo.SHORT_ARRAY_TYPE_INFO
        elif element_type == Types.INT():
            j_typeinfo = JBasicArrayTypeInfo.INT_ARRAY_TYPE_INFO
        elif element_type == Types.LONG():
            j_typeinfo = JBasicArrayTypeInfo.LONG_ARRAY_TYPE_INFO
        elif element_type == Types.FLOAT():
            j_typeinfo = JBasicArrayTypeInfo.FLOAT_ARRAY_TYPE_INFO
        elif element_type == Types.DOUBLE():
            j_typeinfo = JBasicArrayTypeInfo.DOUBLE_ARRAY_TYPE_INFO
        elif element_type == Types.CHAR():
            j_typeinfo = JBasicArrayTypeInfo.CHAR_ARRAY_TYPE_INFO
        elif element_type == Types.STRING():
            j_typeinfo = JBasicArrayTypeInfo.STRING_ARRAY_TYPE_INFO
        else:
            raise TypeError("Invalid element type for a basic array.")

    elif isinstance(type_info, ObjectArrayTypeInfo):
        element_type = type_info._element_type

        j_typeinfo = JTypes.OBJECT_ARRAY(to_java_typeinfo(element_type))

    elif isinstance(type_info, MapTypeInfo):
        j_key_typeinfo = to_java_typeinfo(type_info._key_type_info)
        j_value_typeinfo = to_java_typeinfo(type_info._value_type_info)

        j_typeinfo = JMapTypeInfo(j_key_typeinfo, j_value_typeinfo)
    else:
        j_typeinfo = JPickledByteArrayTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO

    return j_typeinfo
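
A hypothetical usage sketch of the helper above, converting a composite Python type to its Java
counterpart:

# a MAP type: key and value type infos are converted recursively
j_map_typeinfo = to_java_typeinfo(Types.MAP(Types.INT(), Types.STRING()))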