Example #1
    def test_event_time_sliding_window(self):
        data_stream = self.env.from_collection(
            [('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 5), ('hi', 8),
             ('hi', 9), ('hi', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(SlidingEventTimeWindows.of(Time.milliseconds(5), Time.milliseconds(2))) \
            .process(CountWindowProcessFunction(),
                     Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()])) \
            .add_sink(self.test_sink)

        self.env.execute('test_event_time_sliding_window')
        results = self.test_sink.get_results()
        expected = [
            '(hi,-2,3,2)', '(hi,0,5,4)', '(hi,2,7,4)', '(hi,4,9,3)',
            '(hi,6,11,2)', '(hi,8,13,2)', '(hi,12,17,1)', '(hi,14,19,1)'
        ]
        self.assert_equals_sorted(expected, results)
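
The test above relies on two helper classes defined elsewhere in the test module. A minimal sketch of what they might look like, based on the PyFlink TimestampAssigner and ProcessWindowFunction interfaces (the exact field choices are assumptions):

from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream.functions import ProcessWindowFunction


class SecondColumnTimestampAssigner(TimestampAssigner):

    # Use the second tuple field as the event-time timestamp (in milliseconds).
    def extract_timestamp(self, value, record_timestamp) -> int:
        return int(value[1])


class CountWindowProcessFunction(ProcessWindowFunction):

    # Emit (key, window start, window end, element count) for every window.
    def process(self, key, context, elements):
        yield (key, context.window().start, context.window().end,
               len([e for e in elements]))

    def clear(self, context):
        pass
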
    def test_reduce_function_without_data_types(self):
        ds = self.env.from_collection([(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
                                      type_info=Types.ROW(
                                          [Types.INT(),
                                           Types.STRING()]))
        ds.key_by(lambda a: a[1]) \
          .reduce(lambda a, b: Row(a[0] + b[0], b[1])) \
          .add_sink(self.test_sink)
        self.env.execute('reduce_function_test')
        result = self.test_sink.get_results()
        expected = ["1,a", "3,a", "6,a", "4,b"]
        expected.sort()
        result.sort()
        self.assertEqual(expected, result)
Example #3
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to one so that all data, including fired timers and
    # regular records, is processed by the same worker and the collected results
    # arrive in order, which makes the assertions straightforward.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    type_info = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(),
         Types.LONG(),
         Types.DOUBLE(),
         Types.INT(),
         Types.INT()])
    json_row_schema = JsonRowDeserializationSchema.builder().type_info(
        type_info).build()
    kafka_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-source'
    }

    kafka_consumer = FlinkKafkaConsumer("timer-stream-source", json_row_schema,
                                        kafka_props)
    kafka_producer = FlinkKafkaProducer("timer-stream-sink",
                                        SimpleStringSchema(), kafka_props)

    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5))\
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    kafka_consumer.set_start_from_earliest()
    ds = env.add_source(kafka_consumer).assign_timestamps_and_watermarks(
        watermark_strategy)
    ds.key_by(MyKeySelector(), key_type=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
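
For reference, minimal sketches of the helper classes this job assumes. The field choices and the timer delay are assumptions, and the yield-based signatures follow PyFlink 1.13+ (older releases pass a Collector argument instead):

from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream.functions import KeySelector, KeyedProcessFunction


class KafkaRowTimestampAssigner(TimestampAssigner):

    # Assume the first field ('createTime') holds the event time in milliseconds.
    def extract_timestamp(self, value, record_timestamp) -> int:
        return int(value[0])


class MyKeySelector(KeySelector):

    # Key the stream by 'orderId' (the second field), matching key_type=Types.LONG().
    def get_key(self, value):
        return value[1]


class MyProcessFunction(KeyedProcessFunction):

    # Register an event-time timer per element and emit a string when it fires.
    def process_element(self, value, ctx):
        ctx.timer_service().register_event_time_timer(ctx.timestamp() + 1500)
        yield "order {} received at {}".format(value[1], ctx.timestamp())

    def on_timer(self, timestamp, ctx):
        yield "timer fired for key {} at {}".format(ctx.get_current_key(), timestamp)
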
Example #4
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('string_array', DataTypes.ARRAY(DataTypes.STRING())),
        DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT())),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    conversion_row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.OBJECT_ARRAY(Types.STRING()),
        Types.OBJECT_ARRAY(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, conversion_row_type_info, data
    def test_from_collection_with_data_types(self):
        ds = self.env.from_collection(
            [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
            type_info=Types.ROW([Types.INT(),
                                 Types.STRING(),
                                 Types.STRING()]))
        collect_util = DataStreamCollectUtil()
        collect_util.collect(ds)
        self.env.execute("test from collection")
        results = collect_util.results()
        # If the user specifies the data types of the input data, the collected
        # results should be in Row format.
        expected = ['1,Hi,Hello', '2,Hello,Hi']
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)
Example #6
    def test_map_function_with_data_types(self):
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.TUPLE(
                                          [Types.STRING(),
                                           Types.INT()]))

        def map_func(value):
            result = (value[0], len(value[0]), value[1])
            return result

        mapped_stream = ds.map(map_func,
                               type_info=Types.ROW(
                                   [Types.STRING(),
                                    Types.INT(),
                                    Types.INT()]))
        collect_util = DataStreamCollectUtil()
        collect_util.collect(mapped_stream)
        self.env.execute('map_function_test')
        results = collect_util.results()
        expected = ['ab,2,1', 'bdc,3,2', 'cfgs,4,3', 'deeefg,6,4']
        expected.sort()
        results.sort()
        self.assertEqual(expected, results)
Example #7
def connect_operators():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_python_executable(
        r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds1 = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(),
                             Types.STRING(),
                             Types.STRING()]))
    ds2 = s_env.from_collection(
        [(3, 'Hi2', 'Hello2'), (4, 'Hello2', 'Hi2')],
        type_info=Types.ROW([Types.INT(),
                             Types.STRING(),
                             Types.STRING()]))

    # Connect: DataStream, DataStream → ConnectedStreams
    # cs = ds1.connect(ds2).map(MyCoMapFunction())  # , output_type=Types.INT()
    cs = ds1.connect(ds2).flat_map(
        MyCoFlatMapFunction())  # , output_type=Types.INT()
    cs.add_sink(
        StreamingFileSink.for_row_format('/tmp/output',
                                         SimpleStringEncoder()).build())
    print(s_env.get_execution_plan())
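
A minimal sketch of the MyCoFlatMapFunction used above (the actual implementation is not shown; forwarding both inputs as strings is an assumption so that the two sides share one output type):

from pyflink.datastream.functions import CoFlatMapFunction


class MyCoFlatMapFunction(CoFlatMapFunction):

    # flat_map1 handles elements from ds1, flat_map2 handles elements from ds2.
    def flat_map1(self, value):
        yield str(value)

    def flat_map2(self, value):
        yield str(value)

Note that the example only prints the execution plan; an actual run would also need a call to s_env.execute().
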
    def test_from_collection_with_data_types(self):
        ds = self.env.from_collection(
            [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
            type_info=Types.ROW([Types.INT(),
                                 Types.STRING(),
                                 Types.STRING()]))
        test_sink = DataStreamTestSinkFunction()
        ds.add_sink(test_sink)
        self.env.execute("test from collection")
        results = test_sink.get_results(False)
        # If the user specifies the data types of the input data, the collected
        # results should be in Row format.
        expected = ['1,Hi,Hello', '2,Hello,Hi']
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)
    def test_add_custom_source(self):
        custom_source = SourceFunction(
            "org.apache.flink.python.util.MyCustomSourceFunction")
        ds = self.env.add_source(custom_source,
                                 type_info=Types.ROW(
                                     [Types.INT(), Types.STRING()]))
        ds.add_sink(self.test_sink)
        self.env.execute("test add custom source")
        results = self.test_sink.get_results(False)
        expected = [
            '+I[3, Mike]', '+I[1, Marry]', '+I[4, Ted]', '+I[5, Jack]',
            '+I[0, Bob]', '+I[2, Henry]'
        ]
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)
Example #10
def _create_parquet_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('binary', DataTypes.BINARY(10)),
        DataTypes.FIELD('varbinary', DataTypes.VARBINARY(10)),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE().bridged_to('java.sql.Date')),
        DataTypes.FIELD('time', DataTypes.TIME().bridged_to('java.sql.Time')),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp')),
        DataTypes.FIELD('timestamp_ltz', DataTypes.TIMESTAMP_LTZ(3)),
    ])
    row_type_info = Types.ROW_NAMED(
        ['char', 'varchar', 'binary', 'varbinary', 'boolean', 'decimal', 'int', 'bigint', 'double',
         'date', 'time', 'timestamp', 'timestamp_ltz'],
        [Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()),
         Types.PRIMITIVE_ARRAY(Types.BYTE()), Types.BOOLEAN(), Types.BIG_DEC(), Types.INT(),
         Types.LONG(), Types.DOUBLE(), Types.SQL_DATE(), Types.SQL_TIME(), Types.SQL_TIMESTAMP(),
         Types.INSTANT()]
    )
    datetime_ltz = datetime.datetime(1970, 2, 3, 4, 5, 6, 700000, tzinfo=pytz.timezone('UTC'))
    timestamp_ltz = Instant.of_epoch_milli(
        (
            calendar.timegm(datetime_ltz.utctimetuple()) +
            calendar.timegm(time.localtime(0))
        ) * 1000 + datetime_ltz.microsecond // 1000
    )
    data = [Row(
        char='char',
        varchar='varchar',
        binary=b'binary',
        varbinary=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=datetime.date(1970, 1, 1),
        time=datetime.time(1, 1, 1),
        timestamp=datetime.datetime(1970, 1, 2, 3, 4, 5, 600000),
        timestamp_ltz=timestamp_ltz
    )]
    return row_type, row_type_info, data
Example #11
    def test_jdbc_sink(self):
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))
        jdbc_connection_options = JdbcConnectionOptions.JdbcConnectionOptionsBuilder()\
            .with_driver_name('com.mysql.jdbc.Driver')\
            .with_user_name('root')\
            .with_password('password')\
            .with_url('jdbc:mysql://server-name:server-port/database-name').build()

        jdbc_execution_options = JdbcExecutionOptions.builder().with_batch_interval_ms(2000)\
            .with_batch_size(100).with_max_retries(5).build()
        jdbc_sink = JdbcSink.sink("insert into test table", ds.get_type(),
                                  jdbc_connection_options,
                                  jdbc_execution_options)

        ds.add_sink(jdbc_sink).name('jdbc sink')
        plan = eval(self.env.get_execution_plan())
        self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])
        j_output_format = get_field_value(jdbc_sink.get_java_function(),
                                          'outputFormat')

        connection_options = JdbcConnectionOptions(
            get_field_value(
                get_field_value(j_output_format, 'connectionProvider'),
                'jdbcOptions'))
        self.assertEqual(jdbc_connection_options.get_db_url(),
                         connection_options.get_db_url())
        self.assertEqual(jdbc_connection_options.get_driver_name(),
                         connection_options.get_driver_name())
        self.assertEqual(jdbc_connection_options.get_password(),
                         connection_options.get_password())
        self.assertEqual(jdbc_connection_options.get_user_name(),
                         connection_options.get_user_name())

        exec_options = JdbcExecutionOptions(
            get_field_value(j_output_format, 'executionOptions'))
        self.assertEqual(jdbc_execution_options.get_batch_interval_ms(),
                         exec_options.get_batch_interval_ms())
        self.assertEqual(jdbc_execution_options.get_batch_size(),
                         exec_options.get_batch_size())
        self.assertEqual(jdbc_execution_options.get_max_retries(),
                         exec_options.get_max_retries())
Example #12
    def kafka_connector_assertion(self, flink_kafka_consumer_clz, flink_kafka_producer_clz):
        source_topic = 'test_source_topic'
        sink_topic = 'test_sink_topic'
        props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
        type_info = Types.ROW([Types.INT(), Types.STRING()])

        # Test for kafka consumer
        deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(type_info=type_info).build()

        flink_kafka_consumer = flink_kafka_consumer_clz(source_topic, deserialization_schema, props)
        flink_kafka_consumer.set_start_from_earliest()
        flink_kafka_consumer.set_commit_offsets_on_checkpoints(True)

        j_properties = get_private_field(flink_kafka_consumer.get_java_function(), 'properties')
        self.assertEqual('localhost:9092', j_properties.getProperty('bootstrap.servers'))
        self.assertEqual('test_group', j_properties.getProperty('group.id'))
        self.assertTrue(get_private_field(flink_kafka_consumer.get_java_function(),
                                          'enableCommitOnCheckpoints'))
        j_start_up_mode = get_private_field(flink_kafka_consumer.get_java_function(), 'startupMode')

        j_deserializer = get_private_field(flink_kafka_consumer.get_java_function(), 'deserializer')
        j_deserialize_type_info = invoke_java_object_method(j_deserializer, "getProducedType")
        deserialize_type_info = typeinfo._from_java_type(j_deserialize_type_info)
        self.assertTrue(deserialize_type_info == type_info)
        self.assertTrue(j_start_up_mode.equals(get_gateway().jvm
                                               .org.apache.flink.streaming.connectors
                                               .kafka.config.StartupMode.EARLIEST))
        j_topic_desc = get_private_field(flink_kafka_consumer.get_java_function(),
                                         'topicsDescriptor')
        j_topics = invoke_java_object_method(j_topic_desc, 'getFixedTopics')
        self.assertEqual(['test_source_topic'], list(j_topics))

        # Test for kafka producer
        serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
            .build()
        flink_kafka_producer = flink_kafka_producer_clz(sink_topic, serialization_schema, props)
        flink_kafka_producer.set_write_timestamp_to_kafka(False)

        j_producer_config = get_private_field(flink_kafka_producer.get_java_function(),
                                              'producerConfig')
        self.assertEqual('localhost:9092', j_producer_config.getProperty('bootstrap.servers'))
        self.assertEqual('test_group', j_producer_config.getProperty('group.id'))
        self.assertFalse(get_private_field(flink_kafka_producer.get_java_function(),
                                           'writeTimestampToKafka'))
Example #13
    def test_from_data_stream(self):
        self.env.set_parallelism(1)

        ds = self.env.from_collection([(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
                                      type_info=Types.ROW([Types.INT(),
                                                           Types.STRING(),
                                                           Types.STRING()]))
        t_env = self.t_env
        table = t_env.from_data_stream(ds)
        field_names = ['a', 'b', 'c']
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        t_env.register_table_sink("Sink",
                                  source_sink_utils.TestAppendSink(field_names, field_types))
        t_env.insert_into("Sink", table)
        t_env.execute("test_from_data_stream")
        result = source_sink_utils.results()
        expected = ['1,Hi,Hello', '2,Hello,Hi']
        self.assert_equals(result, expected)
    def test_csv_row_serialization_schema(self):
        JRow = get_gateway().jvm.org.apache.flink.types.Row

        j_row = JRow(3)
        j_row.setField(0, "BEGIN")
        j_row.setField(2, "END")

        def field_assertion(field_info, csv_value, value, field_delimiter):
            row_info = Types.ROW([Types.STRING(), field_info, Types.STRING()])
            expected_csv = "BEGIN" + field_delimiter + csv_value + field_delimiter + "END\n"
            j_row.setField(1, value)

            csv_row_serialization_schema = CsvRowSerializationSchema.Builder(row_info)\
                .set_escape_character('*').set_quote_character('\'')\
                .set_array_element_delimiter(':').set_field_delimiter(';').build()
            csv_row_deserialization_schema = CsvRowDeserializationSchema.Builder(row_info)\
                .set_escape_character('*').set_quote_character('\'')\
                .set_array_element_delimiter(':').set_field_delimiter(';').build()

            serialized_bytes = csv_row_serialization_schema._j_serialization_schema.serialize(
                j_row)
            self.assertEqual(expected_csv,
                             str(serialized_bytes, encoding='utf-8'))

            j_deserialized_row = csv_row_deserialization_schema._j_deserialization_schema\
                .deserialize(expected_csv.encode("utf-8"))
            self.assertTrue(j_row.equals(j_deserialized_row))

        field_assertion(Types.STRING(), "'123''4**'", "123'4*", ";")
        field_assertion(Types.STRING(), "'a;b''c'", "a;b'c", ";")
        field_assertion(Types.INT(), "12", 12, ";")

        test_j_row = JRow(2)
        test_j_row.setField(0, "1")
        test_j_row.setField(1, "hello")

        field_assertion(Types.ROW([Types.STRING(),
                                   Types.STRING()]), "'1:hello'", test_j_row,
                        ";")
        test_j_row.setField(1, "hello world")
        field_assertion(Types.ROW([Types.STRING(),
                                   Types.STRING()]), "'1:hello world'",
                        test_j_row, ";")
        field_assertion(Types.STRING(), "null", "null", ";")
Example #15
def _create_orc_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    jvm = get_gateway().jvm
    row_type = DataTypes.ROW([
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('bytes', DataTypes.BYTES()),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE()),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3)),
    ])
    row_type_info = Types.ROW_NAMED([
        'char', 'varchar', 'bytes', 'boolean', 'decimal', 'int', 'bigint',
        'double', 'date', 'timestamp'
    ], [
        Types.STRING(),
        Types.STRING(),
        Types.PRIMITIVE_ARRAY(Types.BYTE()),
        Types.BOOLEAN(),
        Types.BIG_DEC(),
        Types.INT(),
        Types.LONG(),
        Types.DOUBLE(),
        Types.JAVA(jvm.java.time.LocalDate),
        Types.JAVA(jvm.java.time.LocalDateTime)
    ])
    data = [
        Row(
            char='char',
            varchar='varchar',
            bytes=b'varbinary',
            boolean=True,
            decimal=Decimal(1.5),
            int=2147483647,
            bigint=-9223372036854775808,
            double=2e-308,
            date=date(1970, 1, 1),
            timestamp=datetime(1970, 1, 2, 3, 4, 5, 600000),
        )
    ]
    return row_type, row_type_info, data
Example #16
    def test_process_function(self):
        self.env.set_parallelism(1)
        self.env.get_config().set_auto_watermark_interval(2000)
        self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
        data_stream = self.env.from_collection(
            [(1, '1603708211000'), (2, '1603708224000'), (3, '1603708226000'),
             (4, '1603708289000')],
            type_info=Types.ROW([Types.INT(), Types.STRING()]))

        class MyTimestampAssigner(TimestampAssigner):
            def extract_timestamp(self, value, record_timestamp) -> int:
                return int(value[1])

        class MyProcessFunction(ProcessFunction):
            def process_element(self, value, ctx, out):
                current_timestamp = ctx.timestamp()
                current_watermark = ctx.timer_service().current_watermark()
                out.collect(
                    "current timestamp: {}, current watermark: {}, current_value: {}"
                    .format(str(current_timestamp), str(current_watermark),
                            str(value)))

            def on_timer(self, timestamp, ctx, out):
                pass

        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps()\
            .with_timestamp_assigner(MyTimestampAssigner())
        data_stream.assign_timestamps_and_watermarks(watermark_strategy)\
            .process(MyProcessFunction(), output_type=Types.STRING()).add_sink(self.test_sink)
        self.env.execute('test process function')
        result = self.test_sink.get_results()
        expected_result = [
            "current timestamp: 1603708211000, current watermark: "
            "9223372036854775807, current_value: <Row(1, '1603708211000')>",
            "current timestamp: 1603708224000, current watermark: "
            "9223372036854775807, current_value: <Row(2, '1603708224000')>",
            "current timestamp: 1603708226000, current watermark: "
            "9223372036854775807, current_value: <Row(3, '1603708226000')>",
            "current timestamp: 1603708289000, current watermark: "
            "9223372036854775807, current_value: <Row(4, '1603708289000')>"
        ]
        result.sort()
        expected_result.sort()
        self.assertEqual(expected_result, result)
    def test_add_jars(self):
        # Find the Kafka connector jars.
        flink_source_root = _find_flink_source_root()
        jars_abs_path = flink_source_root + '/flink-connectors/flink-sql-connector-kafka'
        specific_jars = glob.glob(jars_abs_path + '/target/flink*.jar')
        specific_jars = ['file://' + specific_jar for specific_jar in specific_jars]

        self.env.add_jars(*specific_jars)
        source_topic = 'test_source_topic'
        props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
        type_info = Types.ROW([Types.INT(), Types.STRING()])

        # Test for the Kafka consumer.
        deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(type_info=type_info).build()

        # This would raise a ClassNotFoundException if the Kafka connector were not
        # added to the pipeline jars.
        kafka_consumer = FlinkKafkaConsumer(source_topic, deserialization_schema, props)
        self.env.add_source(kafka_consumer).print()
        self.env.get_execution_plan()
Example #18
    def test_kinesis_firehose_sink(self):
        _load_specific_flink_module_jars(
            '/flink-connectors/'
            'flink-sql-connector-aws-kinesis-firehose')

        sink_properties = {
            'aws.region': 'eu-west-1',
            'aws.credentials.provider.basic.accesskeyid': 'aws_access_key_id',
            'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
        }

        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))

        kinesis_firehose_sink = KinesisFirehoseSink.builder() \
            .set_firehose_client_properties(sink_properties) \
            .set_serialization_schema(SimpleStringSchema()) \
            .set_delivery_stream_name('stream-1') \
            .set_fail_on_error(False) \
            .set_max_batch_size(500) \
            .set_max_in_flight_requests(50) \
            .set_max_buffered_requests(10000) \
            .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
            .set_max_time_in_buffer_ms(5000) \
            .set_max_record_size_in_bytes(1 * 1024 * 1024) \
            .build()

        ds.sink_to(kinesis_firehose_sink).name('kinesis firehose sink')
        plan = eval(self.env.get_execution_plan())

        self.assertEqual('kinesis firehose sink: Writer',
                         plan['nodes'][1]['type'])
        self.assertEqual(
            get_field_value(kinesis_firehose_sink.get_java_function(),
                            'failOnError'), False)
        self.assertEqual(
            get_field_value(kinesis_firehose_sink.get_java_function(),
                            'deliveryStreamName'), 'stream-1')
Example #19
    def test_json_row_serialization_deserialization_schema(self):
        jvm = get_gateway().jvm
        jsons = [
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\", "
            "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},\"ids\":[1, 2, 3]}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}"
        ]
        expected_jsons = [
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\","
            "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},"
            "\"ids\":[1,2,3]}",
            "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}"
        ]

        row_schema = Types.ROW_NAMED(["svt", "ops", "ids"], [
            Types.STRING(),
            Types.ROW_NAMED(['id'], [Types.STRING()]),
            Types.PRIMITIVE_ARRAY(Types.INT())
        ])

        json_row_serialization_schema = JsonRowSerializationSchema.builder() \
            .with_type_info(row_schema).build()
        json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(row_schema).build()
        json_row_serialization_schema._j_serialization_schema.open(
            jvm.org.apache.flink.connector.testutils.formats.
            DummyInitializationContext())
        json_row_deserialization_schema._j_deserialization_schema.open(
            jvm.org.apache.flink.connector.testutils.formats.
            DummyInitializationContext())

        for i in range(len(jsons)):
            j_row = json_row_deserialization_schema._j_deserialization_schema\
                .deserialize(bytes(jsons[i], encoding='utf-8'))
            result = str(json_row_serialization_schema._j_serialization_schema.
                         serialize(j_row),
                         encoding='utf-8')
            self.assertEqual(expected_jsons[i], result)
Example #20
    def test_tuple_type(self):
        self.assertEqual(TupleTypeInfo([Types.STRING(),
                                        Types.INT()]),
                         TupleTypeInfo([Types.STRING(),
                                        Types.INT()]), True)

        self.assertEqual(
            TupleTypeInfo([Types.STRING(), Types.INT()]).__str__(),
            "TupleTypeInfo(String, Integer)")

        self.assertNotEqual(TupleTypeInfo([Types.STRING(),
                                           Types.INT()]),
                            TupleTypeInfo([Types.STRING(),
                                           Types.BOOLEAN()]))

        self.assertEqual(Types.TUPLE([Types.STRING(),
                                      Types.INT()]),
                         TupleTypeInfo([Types.STRING(),
                                        Types.INT()]))

        self.assertEqual(
            Types.TUPLE([Types.STRING(), Types.INT()]).get_field_types(),
            [Types.STRING(), Types.INT()])
Example #21
def popular_destination_query():
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"))

    query = f"""SELECT 
    destLocationId, wstart, wend, cnt 
FROM 
    (SELECT 
        destLocationId, 
        HOP_START(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wstart, 
        HOP_END(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wend, 
        COUNT(destLocationId) AS cnt 
    FROM
        (SELECT 
            pickupTime, 
            destLocationId 
        FROM TaxiRide) 
    GROUP BY
        destLocationId, HOP(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE)
    )
WHERE cnt > {args.threshold}
"""

    results = t_env.sql_query(query)

    t_env.to_append_stream(
        results,
        Types.ROW_NAMED(['destLocationId', 'wstart', 'wend', 'cnt'], [
            Types.INT(),
            Types.SQL_TIMESTAMP(),
            Types.SQL_TIMESTAMP(),
            Types.LONG()
        ])).print()

    env.execute('Popular-Destination')
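
The job above relies on a create_table_ddl helper and an args.threshold value (presumably parsed with argparse) that are not shown. A hedged sketch of what the DDL helper could look like; the topic name, broker address, and full column list are assumptions, and only pickupTime and destLocationId are actually required by the query:

def create_table_ddl(watermark_clause: str) -> str:
    # Hypothetical Kafka-backed source table; the watermark clause is passed in
    # by the caller so it can be varied per example.
    return f"""CREATE TABLE TaxiRide (
        rideId BIGINT,
        pickupTime TIMESTAMP(3),
        destLocationId INT,
        {watermark_clause}
    ) WITH (
        'connector' = 'kafka',
        'topic' = 'taxi-rides',
        'properties.bootstrap.servers' = 'localhost:9092',
        'properties.group.id' = 'taxi-ride-consumer',
        'format' = 'json'
    )"""
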
Example #22
    def test_from_data_stream_with_schema(self):
        from pyflink.table import Schema

        ds = self.env.from_collection([(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
                                      type_info=Types.ROW_NAMED(
                                          ["a", "b", "c"],
                                          [Types.INT(), Types.STRING(), Types.STRING()]))

        table = self.t_env.from_data_stream(ds,
                                            Schema.new_builder()
                                                  .column("a", DataTypes.INT())
                                                  .column("b", DataTypes.STRING())
                                                  .column("c", DataTypes.STRING())
                                                  .build())
        result = table.execute()
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [item for item in
                               map(str, [Row(1, 'Hi', 'Hello'), Row(2, 'Hello', 'Hi')])]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)
Example #23
    def test_window_reduce_process(self):
        data_stream = self.env.from_collection(
            [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9),
             ('a', 15)],
            type_info=Types.TUPLE([Types.STRING(),
                                   Types.INT()]))  # type: DataStream
        watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
            .with_timestamp_assigner(SecondColumnTimestampAssigner())

        class MyProcessFunction(ProcessWindowFunction):
            def clear(self, context: ProcessWindowFunction.Context) -> None:
                pass

            def process(self, key, context: ProcessWindowFunction.Context,
                        elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
                yield "current window start at {}, reduce result {}".format(
                    context.window().start,
                    next(iter(elements)),
                )

        data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
            .key_by(lambda x: x[0], key_type=Types.STRING()) \
            .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
            .reduce(lambda a, b: (b[0], a[1] + b[1]),
                    window_function=MyProcessFunction(),
                    output_type=Types.STRING()) \
            .add_sink(self.test_sink)

        self.env.execute('test_time_window_reduce_process')
        results = self.test_sink.get_results()
        expected = [
            "current window start at 1, reduce result ('a', 3)",
            "current window start at 15, reduce result ('a', 15)",
            "current window start at 3, reduce result ('b', 3)",
            "current window start at 6, reduce result ('a', 6)",
            "current window start at 8, reduce result ('b', 17)"
        ]
        self.assert_equals_sorted(expected, results)
Example #24
    def test_row_type(self):
        self.assertEqual(
            RowTypeInfo([Types.STRING(), Types.STRING()]).get_field_names(),
            ['f0', 'f1'])
        self.assertEqual(
            RowTypeInfo([Types.STRING(), Types.STRING()],
                        ['a', 'b']).get_field_names(), ['a', 'b'])

        self.assertEqual(
            RowTypeInfo([Types.STRING(), Types.STRING()],
                        ['a', 'b']) == RowTypeInfo(
                            [Types.STRING(), Types.STRING()], ['a', 'b']),
            True)
        self.assertEqual(
            RowTypeInfo([Types.STRING(), Types.STRING()],
                        ['a', 'b']) == RowTypeInfo(
                            [Types.STRING(), Types.INT()], ['a', 'b']), False)
        self.assertEqual(
            RowTypeInfo([Types.STRING(), Types.STRING()],
                        ['a', 'b']).__str__(),
            "RowTypeInfo(a: String, b: String)")

        self.assertEqual(Types.ROW([Types.STRING(),
                                    Types.STRING()]),
                         RowTypeInfo([Types.STRING(),
                                      Types.STRING()]), True)

        self.assertEqual(
            Types.ROW_NAMED(
                ['a', 'b'],
                [Types.STRING(), Types.STRING()]).get_field_names(),
            ['a', 'b'], True)

        self.assertEqual(
            Types.ROW_NAMED(
                ['a', 'b'],
                [Types.STRING(), Types.STRING()]).get_field_types(),
            [Types.STRING(), Types.STRING()], True)
Example #25
    def test_rabbitmq_connectors(self):
        connection_config = RMQConnectionConfig.Builder() \
            .set_host('localhost') \
            .set_port(5672) \
            .set_virtual_host('/') \
            .set_user_name('guest') \
            .set_password('guest') \
            .build()
        type_info = Types.ROW([Types.INT(), Types.STRING()])
        deserialization_schema = JsonRowDeserializationSchema.builder() \
            .type_info(type_info=type_info).build()

        rmq_source = RMQSource(
            connection_config, 'source_queue', True, deserialization_schema)
        self.assertEqual(
            get_field_value(rmq_source.get_java_function(), 'queueName'), 'source_queue')
        self.assertTrue(get_field_value(rmq_source.get_java_function(), 'usesCorrelationId'))

        serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
            .build()
        rmq_sink = RMQSink(connection_config, 'sink_queue', serialization_schema)
        self.assertEqual(
            get_field_value(rmq_sink.get_java_function(), 'queueName'), 'sink_queue')
Example #26
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD(
            'string_array',
            DataTypes.ARRAY(DataTypes.STRING()).bridged_to('java.util.ArrayList')
        ),
        DataTypes.FIELD(
            'int_array',
            DataTypes.ARRAY(DataTypes.INT()).bridged_to('java.util.ArrayList')
        ),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, data
Example #27
    def test_kinesis_streams_sink(self):
        sink_properties = {
            'aws.region': 'us-east-1',
            'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
        }

        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))

        kinesis_streams_sink = KinesisStreamsSink.builder() \
            .set_kinesis_client_properties(sink_properties) \
            .set_serialization_schema(SimpleStringSchema()) \
            .set_partition_key_generator(PartitionKeyGenerator.fixed()) \
            .set_stream_name("stream-1") \
            .set_fail_on_error(False) \
            .set_max_batch_size(500) \
            .set_max_in_flight_requests(50) \
            .set_max_buffered_requests(10000) \
            .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
            .set_max_time_in_buffer_ms(5000) \
            .set_max_record_size_in_bytes(1 * 1024 * 1024) \
            .build()

        ds.sink_to(kinesis_streams_sink).name('kinesis streams sink')
        plan = eval(self.env.get_execution_plan())

        self.assertEqual('kinesis streams sink: Writer',
                         plan['nodes'][1]['type'])
        self.assertEqual(
            get_field_value(kinesis_streams_sink.get_java_function(),
                            'failOnError'), False)
        self.assertEqual(
            get_field_value(kinesis_streams_sink.get_java_function(),
                            'streamName'), 'stream-1')
Example #28
    def test_stream_file_sink(self):
        self.env.set_parallelism(2)
        ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3),
                                       ('deeefg', 4)],
                                      type_info=Types.ROW(
                                          [Types.STRING(),
                                           Types.INT()]))
        ds.map(lambda a: a[0], Types.STRING()).add_sink(
            StreamingFileSink.for_row_format(
                self.tempdir,
                Encoder.simple_string_encoder()).with_rolling_policy(
                    RollingPolicy.default_rolling_policy(
                        part_size=1024 * 1024 * 1024,
                        rollover_interval=15 * 60 * 1000,
                        inactivity_interval=5 * 60 * 1000)).
            with_output_file_config(
                OutputFileConfig.OutputFileConfigBuilder().with_part_prefix(
                    "prefix").with_part_suffix("suffix").build()).build())

        self.env.execute("test_streaming_file_sink")

        results = []
        import os
        for root, dirs, files in os.walk(self.tempdir, topdown=True):
            for file in files:
                self.assertTrue(file.startswith('.prefix'))
                self.assertTrue('suffix' in file)
                path = root + "/" + file
                with open(path) as infile:
                    for line in infile:
                        results.append(line)

        expected = ['deeefg\n', 'bdc\n', 'ab\n', 'cfgs\n']
        results.sort()
        expected.sort()
        self.assertEqual(expected, results)
Example #29
    def test_from_java_type(self):
        basic_int_type_info = Types.INT()
        self.assertEqual(basic_int_type_info,
                         _from_java_type(basic_int_type_info.get_java_type_info()))

        basic_short_type_info = Types.SHORT()
        self.assertEqual(basic_short_type_info,
                         _from_java_type(basic_short_type_info.get_java_type_info()))

        basic_long_type_info = Types.LONG()
        self.assertEqual(basic_long_type_info,
                         _from_java_type(basic_long_type_info.get_java_type_info()))

        basic_float_type_info = Types.FLOAT()
        self.assertEqual(basic_float_type_info,
                         _from_java_type(basic_float_type_info.get_java_type_info()))

        basic_double_type_info = Types.DOUBLE()
        self.assertEqual(basic_double_type_info,
                         _from_java_type(basic_double_type_info.get_java_type_info()))

        basic_char_type_info = Types.CHAR()
        self.assertEqual(basic_char_type_info,
                         _from_java_type(basic_char_type_info.get_java_type_info()))

        basic_byte_type_info = Types.BYTE()
        self.assertEqual(basic_byte_type_info,
                         _from_java_type(basic_byte_type_info.get_java_type_info()))

        basic_big_int_type_info = Types.BIG_INT()
        self.assertEqual(basic_big_int_type_info,
                         _from_java_type(basic_big_int_type_info.get_java_type_info()))

        basic_big_dec_type_info = Types.BIG_DEC()
        self.assertEqual(basic_big_dec_type_info,
                         _from_java_type(basic_big_dec_type_info.get_java_type_info()))

        basic_sql_date_type_info = Types.SQL_DATE()
        self.assertEqual(basic_sql_date_type_info,
                         _from_java_type(basic_sql_date_type_info.get_java_type_info()))

        basic_sql_time_type_info = Types.SQL_TIME()
        self.assertEqual(basic_sql_time_type_info,
                         _from_java_type(basic_sql_time_type_info.get_java_type_info()))

        basic_sql_timestamp_type_info = Types.SQL_TIMESTAMP()
        self.assertEqual(basic_sql_timestamp_type_info,
                         _from_java_type(basic_sql_timestamp_type_info.get_java_type_info()))

        row_type_info = Types.ROW([Types.INT(), Types.STRING()])
        self.assertEqual(row_type_info, _from_java_type(row_type_info.get_java_type_info()))

        tuple_type_info = Types.TUPLE([Types.CHAR(), Types.INT()])
        self.assertEqual(tuple_type_info, _from_java_type(tuple_type_info.get_java_type_info()))

        primitive_int_array_type_info = Types.PRIMITIVE_ARRAY(Types.INT())
        self.assertEqual(primitive_int_array_type_info,
                         _from_java_type(primitive_int_array_type_info.get_java_type_info()))

        object_array_type_info = Types.OBJECT_ARRAY(Types.SQL_DATE())
        self.assertEqual(object_array_type_info,
                         _from_java_type(object_array_type_info.get_java_type_info()))

        pickled_byte_array_type_info = Types.PICKLED_BYTE_ARRAY()
        self.assertEqual(pickled_byte_array_type_info,
                         _from_java_type(pickled_byte_array_type_info.get_java_type_info()))

        sql_date_type_info = Types.SQL_DATE()
        self.assertEqual(sql_date_type_info,
                         _from_java_type(sql_date_type_info.get_java_type_info()))

        map_type_info = Types.MAP(Types.INT(), Types.STRING())
        self.assertEqual(map_type_info,
                         _from_java_type(map_type_info.get_java_type_info()))

        list_type_info = Types.LIST(Types.INT())
        self.assertEqual(list_type_info,
                         _from_java_type(list_type_info.get_java_type_info()))
Example #30
    def test_from_and_to_data_stream_event_time(self):
        from pyflink.table import Schema

        ds = self.env.from_collection(
            [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
            Types.ROW_NAMED(
                ["a", "b", "c"],
                [Types.LONG(), Types.INT(),
                 Types.STRING()]))
        ds = ds.assign_timestamps_and_watermarks(
            WatermarkStrategy.for_monotonous_timestamps(
            ).with_timestamp_assigner(MyTimestampAssigner()))

        table = self.t_env.from_data_stream(
            ds,
            Schema.new_builder().column_by_metadata(
                "rowtime",
                "TIMESTAMP_LTZ(3)").watermark("rowtime",
                                              "SOURCE_WATERMARK()").build())
        self.assertEqual(
            """(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
            table._j_table.getResolvedSchema().toString())
        self.t_env.create_temporary_view(
            "t", ds,
            Schema.new_builder().column_by_metadata(
                "rowtime",
                "TIMESTAMP_LTZ(3)").watermark("rowtime",
                                              "SOURCE_WATERMARK()").build())

        result = self.t_env.execute_sql(
            "SELECT "
            "c, SUM(b) "
            "FROM t "
            "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
        with result.collect() as result:
            collected_result = [str(item) for item in result]
            expected_result = [
                item for item in map(
                    str, [Row('a', 47),
                          Row('c', 1000),
                          Row('c', 1000)])
            ]
            expected_result.sort()
            collected_result.sort()
            self.assertEqual(expected_result, collected_result)

        ds = self.t_env.to_data_stream(table)
        ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
            .window(MyTumblingEventTimeWindow()) \
            .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
            .add_sink(self.test_sink)
        self.env.execute()
        expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
        actual_results = self.test_sink.get_results(False)
        expected_results.sort()
        actual_results.sort()
        self.assertEqual(expected_results, actual_results)
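
This test references three helpers that are not shown: MyTimestampAssigner, MyTumblingEventTimeWindow (a custom WindowAssigner, omitted here), and SumWindowFunction. Minimal sketches of the two simpler ones, under the assumption that field 'a' carries the event-time timestamp and field 'b' is summed per key:

from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream.functions import WindowFunction


class MyTimestampAssigner(TimestampAssigner):

    # Assume the first field ('a') is the event-time timestamp in milliseconds.
    def extract_timestamp(self, value, record_timestamp) -> int:
        return int(value[0])


class SumWindowFunction(WindowFunction):

    # Sum field 'b' per key and window, emitting (key, sum) tuples.
    def apply(self, key, window, inputs):
        yield key, sum([e[1] for e in inputs])
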