def test_event_time_sliding_window(self):
    data_stream = self.env.from_collection(
        [('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 5), ('hi', 8), ('hi', 9),
         ('hi', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream

    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(SlidingEventTimeWindows.of(Time.milliseconds(5), Time.milliseconds(2))) \
        .process(CountWindowProcessFunction(),
                 Types.TUPLE([Types.STRING(), Types.LONG(), Types.LONG(), Types.INT()])) \
        .add_sink(self.test_sink)

    self.env.execute('test_event_time_sliding_window')
    results = self.test_sink.get_results()
    expected = ['(hi,-2,3,2)', '(hi,0,5,4)', '(hi,2,7,4)', '(hi,4,9,3)', '(hi,6,11,2)',
                '(hi,8,13,2)', '(hi,12,17,1)', '(hi,14,19,1)']
    self.assert_equals_sorted(expected, results)
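# The sliding-window test above (and the session-window test further below) relies on helper
# classes defined elsewhere in the test module. The following is a minimal sketch of what they
# are assumed to look like, consistent with how the tests use them; the exact originals may
# differ.
from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream.functions import ProcessWindowFunction


class SecondColumnTimestampAssigner(TimestampAssigner):

    def extract_timestamp(self, value, record_timestamp) -> int:
        # The second tuple field is treated as the event timestamp (in milliseconds).
        return int(value[1])


class CountWindowProcessFunction(ProcessWindowFunction):

    def process(self, key, context, elements):
        # Emits (key, window_start, window_end, element_count) for each fired window.
        yield key, context.window().start, context.window().end, len(list(elements))

    def clear(self, context):
        # No per-window state to clean up in this sketch.
        pass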
def test_reduce_function_without_data_types(self):
    ds = self.env.from_collection([(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
                                  type_info=Types.ROW([Types.INT(), Types.STRING()]))
    ds.key_by(lambda a: a[1]) \
        .reduce(lambda a, b: Row(a[0] + b[0], b[1])) \
        .add_sink(self.test_sink)
    self.env.execute('reduce_function_test')
    result = self.test_sink.get_results()
    expected = ["1,a", "3,a", "6,a", "4,b"]
    expected.sort()
    result.sort()
    self.assertEqual(expected, result)
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to one to make sure that all data, including fired timers and normal
    # records, is processed by the same worker and the collected result is in order, which is
    # good for assertion.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    type_info = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(), Types.INT()])
    json_row_schema = JsonRowDeserializationSchema.builder().type_info(type_info).build()
    kafka_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-source'}
    kafka_consumer = FlinkKafkaConsumer("timer-stream-source", json_row_schema, kafka_props)
    kafka_producer = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(), kafka_props)

    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5)) \
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    kafka_consumer.set_start_from_earliest()
    ds = env.add_source(kafka_consumer).assign_timestamps_and_watermarks(watermark_strategy)
    ds.key_by(MyKeySelector(), key_type=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
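# Hypothetical sketches of the user-defined helpers referenced in python_data_stream_example
# above; the real implementations live elsewhere in the project, so these are only meant to
# make the example self-contained. Depending on the PyFlink version, process functions either
# yield results (as shown here) or write to a collector argument.
from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream.functions import KeyedProcessFunction, KeySelector


class MyKeySelector(KeySelector):

    def get_key(self, value):
        # Key by orderId, the second field of the row.
        return value[1]


class KafkaRowTimestampAssigner(TimestampAssigner):

    def extract_timestamp(self, value, record_timestamp) -> int:
        # Use createTime, the first field of the row, as the event timestamp.
        return int(value[0])


class MyProcessFunction(KeyedProcessFunction):

    def process_element(self, value, ctx):
        # Register an event-time timer and emit a marker string for each element.
        ctx.timer_service().register_event_time_timer(ctx.timestamp() + 1000)
        yield "processed: {}".format(value)

    def on_timer(self, timestamp, ctx):
        yield "timer fired at: {}".format(timestamp)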
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('string_array', DataTypes.ARRAY(DataTypes.STRING())),
        DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT())),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    conversion_row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.OBJECT_ARRAY(Types.STRING()),
        Types.OBJECT_ARRAY(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, conversion_row_type_info, data
def test_from_collection_with_data_types(self):
    ds = self.env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))
    collect_util = DataStreamCollectUtil()
    collect_util.collect(ds)
    self.env.execute("test from collection")
    results = collect_util.results()
    # If the user specifies the data types of the input data, the collected result should be
    # in Row format.
    expected = ['1,Hi,Hello', '2,Hello,Hi']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
def test_map_function_with_data_types(self):
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

    def map_func(value):
        result = (value[0], len(value[0]), value[1])
        return result

    mapped_stream = ds.map(map_func,
                           type_info=Types.ROW([Types.STRING(), Types.INT(), Types.INT()]))
    collect_util = DataStreamCollectUtil()
    collect_util.collect(mapped_stream)
    self.env.execute('map_function_test')
    results = collect_util.results()
    expected = ['ab,2,1', 'bdc,3,2', 'cfgs,4,3', 'deeefg,6,4']
    expected.sort()
    results.sort()
    self.assertEqual(expected, results)
def connect_operators():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_python_executable(r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds1 = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))
    ds2 = s_env.from_collection(
        [(3, 'Hi2', 'Hello2'), (4, 'Hello2', 'Hi2')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))

    # Connect two streams: DataStream, DataStream → ConnectedStreams
    # cs = ds1.connect(ds2).map(MyCoMapFunction())  # , output_type=Types.INT()
    cs = ds1.connect(ds2).flat_map(MyCoFlatMapFunction())  # , output_type=Types.INT()
    cs.add_sink(
        StreamingFileSink.for_row_format('/tmp/output', SimpleStringEncoder()).build())
    print(s_env.get_execution_plan())
def test_from_collection_with_data_types(self):
    ds = self.env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))
    test_sink = DataStreamTestSinkFunction()
    ds.add_sink(test_sink)
    self.env.execute("test from collection")
    results = test_sink.get_results(False)
    # If the user specifies the data types of the input data, the collected result should be
    # in Row format.
    expected = ['1,Hi,Hello', '2,Hello,Hi']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
def test_add_custom_source(self):
    custom_source = SourceFunction("org.apache.flink.python.util.MyCustomSourceFunction")
    ds = self.env.add_source(custom_source,
                             type_info=Types.ROW([Types.INT(), Types.STRING()]))
    ds.add_sink(self.test_sink)
    self.env.execute("test add custom source")
    results = self.test_sink.get_results(False)
    expected = ['+I[3, Mike]', '+I[1, Marry]', '+I[4, Ted]', '+I[5, Jack]', '+I[0, Bob]',
                '+I[2, Henry]']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
def _create_parquet_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('binary', DataTypes.BINARY(10)),
        DataTypes.FIELD('varbinary', DataTypes.VARBINARY(10)),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE().bridged_to('java.sql.Date')),
        DataTypes.FIELD('time', DataTypes.TIME().bridged_to('java.sql.Time')),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3).bridged_to('java.sql.Timestamp')),
        DataTypes.FIELD('timestamp_ltz', DataTypes.TIMESTAMP_LTZ(3)),
    ])
    row_type_info = Types.ROW_NAMED(
        ['char', 'varchar', 'binary', 'varbinary', 'boolean', 'decimal', 'int', 'bigint',
         'double', 'date', 'time', 'timestamp', 'timestamp_ltz'],
        [Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()),
         Types.PRIMITIVE_ARRAY(Types.BYTE()), Types.BOOLEAN(), Types.BIG_DEC(), Types.INT(),
         Types.LONG(), Types.DOUBLE(), Types.SQL_DATE(), Types.SQL_TIME(),
         Types.SQL_TIMESTAMP(), Types.INSTANT()]
    )
    datetime_ltz = datetime.datetime(1970, 2, 3, 4, 5, 6, 700000, tzinfo=pytz.timezone('UTC'))
    timestamp_ltz = Instant.of_epoch_milli(
        (
            calendar.timegm(datetime_ltz.utctimetuple()) +
            calendar.timegm(time.localtime(0))
        ) * 1000 + datetime_ltz.microsecond // 1000
    )
    data = [Row(
        char='char',
        varchar='varchar',
        binary=b'binary',
        varbinary=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=datetime.date(1970, 1, 1),
        time=datetime.time(1, 1, 1),
        timestamp=datetime.datetime(1970, 1, 2, 3, 4, 5, 600000),
        timestamp_ltz=timestamp_ltz
    )]
    return row_type, row_type_info, data
def test_jdbc_sink(self):
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))
    jdbc_connection_options = JdbcConnectionOptions.JdbcConnectionOptionsBuilder() \
        .with_driver_name('com.mysql.jdbc.Driver') \
        .with_user_name('root') \
        .with_password('password') \
        .with_url('jdbc:mysql://server-name:server-port/database-name').build()

    jdbc_execution_options = JdbcExecutionOptions.builder().with_batch_interval_ms(2000) \
        .with_batch_size(100).with_max_retries(5).build()
    jdbc_sink = JdbcSink.sink("insert into test table", ds.get_type(),
                              jdbc_connection_options, jdbc_execution_options)

    ds.add_sink(jdbc_sink).name('jdbc sink')
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])
    j_output_format = get_field_value(jdbc_sink.get_java_function(), 'outputFormat')

    connection_options = JdbcConnectionOptions(
        get_field_value(get_field_value(j_output_format, 'connectionProvider'), 'jdbcOptions'))
    self.assertEqual(jdbc_connection_options.get_db_url(), connection_options.get_db_url())
    self.assertEqual(jdbc_connection_options.get_driver_name(),
                     connection_options.get_driver_name())
    self.assertEqual(jdbc_connection_options.get_password(), connection_options.get_password())
    self.assertEqual(jdbc_connection_options.get_user_name(),
                     connection_options.get_user_name())

    exec_options = JdbcExecutionOptions(get_field_value(j_output_format, 'executionOptions'))
    self.assertEqual(jdbc_execution_options.get_batch_interval_ms(),
                     exec_options.get_batch_interval_ms())
    self.assertEqual(jdbc_execution_options.get_batch_size(), exec_options.get_batch_size())
    self.assertEqual(jdbc_execution_options.get_max_retries(), exec_options.get_max_retries())
def kafka_connector_assertion(self, flink_kafka_consumer_clz, flink_kafka_producer_clz):
    source_topic = 'test_source_topic'
    sink_topic = 'test_sink_topic'
    props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    type_info = Types.ROW([Types.INT(), Types.STRING()])

    # Test for kafka consumer
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    flink_kafka_consumer = flink_kafka_consumer_clz(source_topic, deserialization_schema, props)
    flink_kafka_consumer.set_start_from_earliest()
    flink_kafka_consumer.set_commit_offsets_on_checkpoints(True)

    j_properties = get_private_field(flink_kafka_consumer.get_java_function(), 'properties')
    self.assertEqual('localhost:9092', j_properties.getProperty('bootstrap.servers'))
    self.assertEqual('test_group', j_properties.getProperty('group.id'))
    self.assertTrue(get_private_field(flink_kafka_consumer.get_java_function(),
                                      'enableCommitOnCheckpoints'))
    j_start_up_mode = get_private_field(flink_kafka_consumer.get_java_function(), 'startupMode')

    j_deserializer = get_private_field(flink_kafka_consumer.get_java_function(), 'deserializer')
    j_deserialize_type_info = invoke_java_object_method(j_deserializer, "getProducedType")
    deserialize_type_info = typeinfo._from_java_type(j_deserialize_type_info)
    self.assertTrue(deserialize_type_info == type_info)
    self.assertTrue(j_start_up_mode.equals(get_gateway().jvm
                                           .org.apache.flink.streaming.connectors
                                           .kafka.config.StartupMode.EARLIEST))
    j_topic_desc = get_private_field(flink_kafka_consumer.get_java_function(),
                                     'topicsDescriptor')
    j_topics = invoke_java_object_method(j_topic_desc, 'getFixedTopics')
    self.assertEqual(['test_source_topic'], list(j_topics))

    # Test for kafka producer
    serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
        .build()
    flink_kafka_producer = flink_kafka_producer_clz(sink_topic, serialization_schema, props)
    flink_kafka_producer.set_write_timestamp_to_kafka(False)

    j_producer_config = get_private_field(flink_kafka_producer.get_java_function(),
                                          'producerConfig')
    self.assertEqual('localhost:9092', j_producer_config.getProperty('bootstrap.servers'))
    self.assertEqual('test_group', j_producer_config.getProperty('group.id'))
    self.assertFalse(get_private_field(flink_kafka_producer.get_java_function(),
                                       'writeTimestampToKafka'))
def test_from_data_stream(self):
    self.env.set_parallelism(1)
    ds = self.env.from_collection([(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
                                  type_info=Types.ROW([Types.INT(), Types.STRING(),
                                                       Types.STRING()]))
    t_env = self.t_env
    table = t_env.from_data_stream(ds)
    field_names = ['a', 'b', 'c']
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    t_env.register_table_sink("Sink",
                              source_sink_utils.TestAppendSink(field_names, field_types))
    t_env.insert_into("Sink", table)
    t_env.execute("test_from_data_stream")
    result = source_sink_utils.results()
    expected = ['1,Hi,Hello', '2,Hello,Hi']
    self.assert_equals(result, expected)
def test_csv_row_serialization_schema(self):
    JRow = get_gateway().jvm.org.apache.flink.types.Row

    j_row = JRow(3)
    j_row.setField(0, "BEGIN")
    j_row.setField(2, "END")

    def field_assertion(field_info, csv_value, value, field_delimiter):
        row_info = Types.ROW([Types.STRING(), field_info, Types.STRING()])
        expected_csv = "BEGIN" + field_delimiter + csv_value + field_delimiter + "END\n"
        j_row.setField(1, value)

        csv_row_serialization_schema = CsvRowSerializationSchema.Builder(row_info) \
            .set_escape_character('*').set_quote_character('\'') \
            .set_array_element_delimiter(':').set_field_delimiter(';').build()
        csv_row_deserialization_schema = CsvRowDeserializationSchema.Builder(row_info) \
            .set_escape_character('*').set_quote_character('\'') \
            .set_array_element_delimiter(':').set_field_delimiter(';').build()

        serialized_bytes = csv_row_serialization_schema._j_serialization_schema.serialize(j_row)
        self.assertEqual(expected_csv, str(serialized_bytes, encoding='utf-8'))

        j_deserialized_row = csv_row_deserialization_schema._j_deserialization_schema \
            .deserialize(expected_csv.encode("utf-8"))
        self.assertTrue(j_row.equals(j_deserialized_row))

    field_assertion(Types.STRING(), "'123''4**'", "123'4*", ";")
    field_assertion(Types.STRING(), "'a;b''c'", "a;b'c", ";")
    field_assertion(Types.INT(), "12", 12, ";")

    test_j_row = JRow(2)
    test_j_row.setField(0, "1")
    test_j_row.setField(1, "hello")
    field_assertion(Types.ROW([Types.STRING(), Types.STRING()]), "'1:hello'", test_j_row, ";")
    test_j_row.setField(1, "hello world")
    field_assertion(Types.ROW([Types.STRING(), Types.STRING()]), "'1:hello world'",
                    test_j_row, ";")
    field_assertion(Types.STRING(), "null", "null", ";")
def _create_orc_basic_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    jvm = get_gateway().jvm
    row_type = DataTypes.ROW([
        DataTypes.FIELD('char', DataTypes.CHAR(10)),
        DataTypes.FIELD('varchar', DataTypes.VARCHAR(10)),
        DataTypes.FIELD('bytes', DataTypes.BYTES()),
        DataTypes.FIELD('boolean', DataTypes.BOOLEAN()),
        DataTypes.FIELD('decimal', DataTypes.DECIMAL(2, 0)),
        DataTypes.FIELD('int', DataTypes.INT()),
        DataTypes.FIELD('bigint', DataTypes.BIGINT()),
        DataTypes.FIELD('double', DataTypes.DOUBLE()),
        DataTypes.FIELD('date', DataTypes.DATE()),
        DataTypes.FIELD('timestamp', DataTypes.TIMESTAMP(3)),
    ])
    row_type_info = Types.ROW_NAMED([
        'char', 'varchar', 'bytes', 'boolean', 'decimal', 'int', 'bigint', 'double', 'date',
        'timestamp'
    ], [
        Types.STRING(), Types.STRING(), Types.PRIMITIVE_ARRAY(Types.BYTE()), Types.BOOLEAN(),
        Types.BIG_DEC(), Types.INT(), Types.LONG(), Types.DOUBLE(),
        Types.JAVA(jvm.java.time.LocalTime), Types.JAVA(jvm.java.time.LocalDateTime)
    ])
    data = [Row(
        char='char',
        varchar='varchar',
        bytes=b'varbinary',
        boolean=True,
        decimal=Decimal(1.5),
        int=2147483647,
        bigint=-9223372036854775808,
        double=2e-308,
        date=date(1970, 1, 1),
        timestamp=datetime(1970, 1, 2, 3, 4, 5, 600000),
    )]
    return row_type, row_type_info, data
def test_process_function(self):
    self.env.set_parallelism(1)
    self.env.get_config().set_auto_watermark_interval(2000)
    self.env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    data_stream = self.env.from_collection([(1, '1603708211000'),
                                            (2, '1603708224000'),
                                            (3, '1603708226000'),
                                            (4, '1603708289000')],
                                           type_info=Types.ROW([Types.INT(), Types.STRING()]))

    class MyTimestampAssigner(TimestampAssigner):

        def extract_timestamp(self, value, record_timestamp) -> int:
            return int(value[1])

    class MyProcessFunction(ProcessFunction):

        def process_element(self, value, ctx, out):
            current_timestamp = ctx.timestamp()
            current_watermark = ctx.timer_service().current_watermark()
            out.collect("current timestamp: {}, current watermark: {}, current_value: {}"
                        .format(str(current_timestamp), str(current_watermark), str(value)))

        def on_timer(self, timestamp, ctx, out):
            pass

    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(MyTimestampAssigner())
    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(self.test_sink)
    self.env.execute('test process function')
    result = self.test_sink.get_results()
    expected_result = ["current timestamp: 1603708211000, current watermark: "
                       "9223372036854775807, current_value: <Row(1, '1603708211000')>",
                       "current timestamp: 1603708224000, current watermark: "
                       "9223372036854775807, current_value: <Row(2, '1603708224000')>",
                       "current timestamp: 1603708226000, current watermark: "
                       "9223372036854775807, current_value: <Row(3, '1603708226000')>",
                       "current timestamp: 1603708289000, current watermark: "
                       "9223372036854775807, current_value: <Row(4, '1603708289000')>"]
    result.sort()
    expected_result.sort()
    self.assertEqual(expected_result, result)
def test_add_jars(self):
    # find kafka connector jars
    flink_source_root = _find_flink_source_root()
    jars_abs_path = flink_source_root + '/flink-connectors/flink-sql-connector-kafka'
    specific_jars = glob.glob(jars_abs_path + '/target/flink*.jar')
    specific_jars = ['file://' + specific_jar for specific_jar in specific_jars]

    self.env.add_jars(*specific_jars)
    source_topic = 'test_source_topic'
    props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    type_info = Types.ROW([Types.INT(), Types.STRING()])

    # Test for kafka consumer
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    # Will raise a ClassNotFoundException if the kafka connector is not added to the
    # pipeline jars.
    kafka_consumer = FlinkKafkaConsumer(source_topic, deserialization_schema, props)
    self.env.add_source(kafka_consumer).print()
    self.env.get_execution_plan()
def test_kinesis_firehose_sink(self):
    _load_specific_flink_module_jars('/flink-connectors/'
                                     'flink-sql-connector-aws-kinesis-firehose')
    sink_properties = {
        'aws.region': 'eu-west-1',
        'aws.credentials.provider.basic.accesskeyid': 'aws_access_key_id',
        'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
    }

    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))

    kinesis_firehose_sink = KinesisFirehoseSink.builder() \
        .set_firehose_client_properties(sink_properties) \
        .set_serialization_schema(SimpleStringSchema()) \
        .set_delivery_stream_name('stream-1') \
        .set_fail_on_error(False) \
        .set_max_batch_size(500) \
        .set_max_in_flight_requests(50) \
        .set_max_buffered_requests(10000) \
        .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
        .set_max_time_in_buffer_ms(5000) \
        .set_max_record_size_in_bytes(1 * 1024 * 1024) \
        .build()

    ds.sink_to(kinesis_firehose_sink).name('kinesis firehose sink')

    plan = eval(self.env.get_execution_plan())
    self.assertEqual('kinesis firehose sink: Writer', plan['nodes'][1]['type'])
    self.assertEqual(get_field_value(kinesis_firehose_sink.get_java_function(), 'failOnError'),
                     False)
    self.assertEqual(
        get_field_value(kinesis_firehose_sink.get_java_function(), 'deliveryStreamName'),
        'stream-1')
def test_json_row_serialization_deserialization_schema(self):
    jvm = get_gateway().jvm
    jsons = ["{\"svt\":\"2020-02-24T12:58:09.209+0800\"}",
             "{\"svt\":\"2020-02-24T12:58:09.209+0800\", "
             "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},\"ids\":[1, 2, 3]}",
             "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}"]
    expected_jsons = ["{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}",
                      "{\"svt\":\"2020-02-24T12:58:09.209+0800\","
                      "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},"
                      "\"ids\":[1,2,3]}",
                      "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}"]

    row_schema = Types.ROW_NAMED(["svt", "ops", "ids"],
                                 [Types.STRING(),
                                  Types.ROW_NAMED(['id'], [Types.STRING()]),
                                  Types.PRIMITIVE_ARRAY(Types.INT())])

    json_row_serialization_schema = JsonRowSerializationSchema.builder() \
        .with_type_info(row_schema).build()
    json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(row_schema).build()
    json_row_serialization_schema._j_serialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext())
    json_row_deserialization_schema._j_deserialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext())

    for i in range(len(jsons)):
        j_row = json_row_deserialization_schema._j_deserialization_schema \
            .deserialize(bytes(jsons[i], encoding='utf-8'))
        result = str(json_row_serialization_schema._j_serialization_schema
                     .serialize(j_row), encoding='utf-8')
        self.assertEqual(expected_jsons[i], result)
def test_tuple_type(self):
    self.assertEqual(TupleTypeInfo([Types.STRING(), Types.INT()]),
                     TupleTypeInfo([Types.STRING(), Types.INT()]), True)
    self.assertEqual(TupleTypeInfo([Types.STRING(), Types.INT()]).__str__(),
                     "TupleTypeInfo(String, Integer)")
    self.assertNotEqual(TupleTypeInfo([Types.STRING(), Types.INT()]),
                        TupleTypeInfo([Types.STRING(), Types.BOOLEAN()]))
    self.assertEqual(Types.TUPLE([Types.STRING(), Types.INT()]),
                     TupleTypeInfo([Types.STRING(), Types.INT()]))
    self.assertEqual(Types.TUPLE([Types.STRING(), Types.INT()]).get_field_types(),
                     [Types.STRING(), Types.INT()])
def popular_destination_query():
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)
    t_env.execute_sql(create_table_ddl(
        "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"))
    query = f"""SELECT
        destLocationId, wstart, wend, cnt
    FROM
        (SELECT
            destLocationId,
            HOP_START(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wstart,
            HOP_END(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wend,
            COUNT(destLocationId) AS cnt
        FROM
            (SELECT
                pickupTime,
                destLocationId
            FROM TaxiRide)
        GROUP BY
            destLocationId, HOP(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE))
    WHERE cnt > {args.threshold}
    """
    results = t_env.sql_query(query)
    t_env.to_append_stream(
        results,
        Types.ROW_NAMED(['destLocationId', 'wstart', 'wend', 'cnt'],
                        [Types.INT(), Types.SQL_TIMESTAMP(), Types.SQL_TIMESTAMP(),
                         Types.LONG()])).print()
    env.execute('Popular-Destination')
def test_from_data_stream_with_schema(self):
    from pyflink.table import Schema

    ds = self.env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW_NAMED(["a", "b", "c"],
                                  [Types.INT(), Types.STRING(), Types.STRING()]))

    table = self.t_env.from_data_stream(ds,
                                        Schema.new_builder()
                                              .column("a", DataTypes.INT())
                                              .column("b", DataTypes.STRING())
                                              .column("c", DataTypes.STRING())
                                              .build())
    result = table.execute()
    with result.collect() as result:
        collected_result = [str(item) for item in result]
        expected_result = [item for item in
                           map(str, [Row(1, 'Hi', 'Hello'), Row(2, 'Hello', 'Hi')])]
        expected_result.sort()
        collected_result.sort()
        self.assertEqual(expected_result, collected_result)
def test_window_reduce_process(self):
    data_stream = self.env.from_collection(
        [('a', 1), ('a', 2), ('b', 3), ('a', 6), ('b', 8), ('b', 9), ('a', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())

    class MyProcessFunction(ProcessWindowFunction):

        def clear(self, context: ProcessWindowFunction.Context) -> None:
            pass

        def process(self, key, context: ProcessWindowFunction.Context,
                    elements: Iterable[Tuple[str, int]]) -> Iterable[str]:
            yield "current window start at {}, reduce result {}".format(
                context.window().start,
                next(iter(elements)),
            )

    data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
        .key_by(lambda x: x[0], key_type=Types.STRING()) \
        .window(EventTimeSessionWindows.with_gap(Time.milliseconds(2))) \
        .reduce(lambda a, b: (b[0], a[1] + b[1]),
                window_function=MyProcessFunction(),
                output_type=Types.STRING()) \
        .add_sink(self.test_sink)

    self.env.execute('test_time_window_reduce_process')
    results = self.test_sink.get_results()
    expected = ["current window start at 1, reduce result ('a', 3)",
                "current window start at 15, reduce result ('a', 15)",
                "current window start at 3, reduce result ('b', 3)",
                "current window start at 6, reduce result ('a', 6)",
                "current window start at 8, reduce result ('b', 17)"]
    self.assert_equals_sorted(expected, results)
def test_row_type(self):
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()]).get_field_names(),
                     ['f0', 'f1'])
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()],
                                 ['a', 'b']).get_field_names(), ['a', 'b'])

    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']) ==
                     RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']), True)
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']) ==
                     RowTypeInfo([Types.STRING(), Types.INT()], ['a', 'b']), False)
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']).__str__(),
                     "RowTypeInfo(a: String, b: String)")

    self.assertEqual(Types.ROW([Types.STRING(), Types.STRING()]),
                     RowTypeInfo([Types.STRING(), Types.STRING()]), True)

    self.assertEqual(Types.ROW_NAMED(['a', 'b'],
                                     [Types.STRING(), Types.STRING()]).get_field_names(),
                     ['a', 'b'], True)

    self.assertEqual(Types.ROW_NAMED(['a', 'b'],
                                     [Types.STRING(), Types.STRING()]).get_field_types(),
                     [Types.STRING(), Types.STRING()], True)
def test_rabbitmq_connectors(self):
    connection_config = RMQConnectionConfig.Builder() \
        .set_host('localhost') \
        .set_port(5672) \
        .set_virtual_host('/') \
        .set_user_name('guest') \
        .set_password('guest') \
        .build()
    type_info = Types.ROW([Types.INT(), Types.STRING()])
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    rmq_source = RMQSource(connection_config, 'source_queue', True, deserialization_schema)
    self.assertEqual(get_field_value(rmq_source.get_java_function(), 'queueName'),
                     'source_queue')
    self.assertTrue(get_field_value(rmq_source.get_java_function(), 'usesCorrelationId'))

    serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
        .build()
    rmq_sink = RMQSink(connection_config, 'sink_queue', serialization_schema)
    self.assertEqual(get_field_value(rmq_sink.get_java_function(), 'queueName'), 'sink_queue')
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    row_type = DataTypes.ROW([
        DataTypes.FIELD(
            'string_array',
            DataTypes.ARRAY(DataTypes.STRING()).bridged_to('java.util.ArrayList')
        ),
        DataTypes.FIELD(
            'int_array',
            DataTypes.ARRAY(DataTypes.INT()).bridged_to('java.util.ArrayList')
        ),
    ])
    row_type_info = Types.ROW_NAMED([
        'string_array',
        'int_array',
    ], [
        Types.LIST(Types.STRING()),
        Types.LIST(Types.INT()),
    ])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, data
def test_kinesis_streams_sink(self):
    sink_properties = {
        'aws.region': 'us-east-1',
        'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
    }

    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))

    kinesis_streams_sink = KinesisStreamsSink.builder() \
        .set_kinesis_client_properties(sink_properties) \
        .set_serialization_schema(SimpleStringSchema()) \
        .set_partition_key_generator(PartitionKeyGenerator.fixed()) \
        .set_stream_name("stream-1") \
        .set_fail_on_error(False) \
        .set_max_batch_size(500) \
        .set_max_in_flight_requests(50) \
        .set_max_buffered_requests(10000) \
        .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
        .set_max_time_in_buffer_ms(5000) \
        .set_max_record_size_in_bytes(1 * 1024 * 1024) \
        .build()

    ds.sink_to(kinesis_streams_sink).name('kinesis streams sink')

    plan = eval(self.env.get_execution_plan())
    self.assertEqual('kinesis streams sink: Writer', plan['nodes'][1]['type'])
    self.assertEqual(get_field_value(kinesis_streams_sink.get_java_function(), 'failOnError'),
                     False)
    self.assertEqual(get_field_value(kinesis_streams_sink.get_java_function(), 'streamName'),
                     'stream-1')
def test_stream_file_sink(self):
    self.env.set_parallelism(2)
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))
    ds.map(lambda a: a[0], Types.STRING()).add_sink(
        StreamingFileSink.for_row_format(self.tempdir, Encoder.simple_string_encoder())
        .with_rolling_policy(
            RollingPolicy.default_rolling_policy(
                part_size=1024 * 1024 * 1024,
                rollover_interval=15 * 60 * 1000,
                inactivity_interval=5 * 60 * 1000))
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix")
            .build())
        .build())

    self.env.execute("test_streaming_file_sink")

    results = []
    import os
    for root, dirs, files in os.walk(self.tempdir, topdown=True):
        for file in files:
            self.assertTrue(file.startswith('.prefix'))
            self.assertTrue('suffix' in file)
            path = root + "/" + file
            with open(path) as infile:
                for line in infile:
                    results.append(line)

    expected = ['deeefg\n', 'bdc\n', 'ab\n', 'cfgs\n']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
def test_from_java_type(self):
    basic_int_type_info = Types.INT()
    self.assertEqual(basic_int_type_info,
                     _from_java_type(basic_int_type_info.get_java_type_info()))

    basic_short_type_info = Types.SHORT()
    self.assertEqual(basic_short_type_info,
                     _from_java_type(basic_short_type_info.get_java_type_info()))

    basic_long_type_info = Types.LONG()
    self.assertEqual(basic_long_type_info,
                     _from_java_type(basic_long_type_info.get_java_type_info()))

    basic_float_type_info = Types.FLOAT()
    self.assertEqual(basic_float_type_info,
                     _from_java_type(basic_float_type_info.get_java_type_info()))

    basic_double_type_info = Types.DOUBLE()
    self.assertEqual(basic_double_type_info,
                     _from_java_type(basic_double_type_info.get_java_type_info()))

    basic_char_type_info = Types.CHAR()
    self.assertEqual(basic_char_type_info,
                     _from_java_type(basic_char_type_info.get_java_type_info()))

    basic_byte_type_info = Types.BYTE()
    self.assertEqual(basic_byte_type_info,
                     _from_java_type(basic_byte_type_info.get_java_type_info()))

    basic_big_int_type_info = Types.BIG_INT()
    self.assertEqual(basic_big_int_type_info,
                     _from_java_type(basic_big_int_type_info.get_java_type_info()))

    basic_big_dec_type_info = Types.BIG_DEC()
    self.assertEqual(basic_big_dec_type_info,
                     _from_java_type(basic_big_dec_type_info.get_java_type_info()))

    basic_sql_date_type_info = Types.SQL_DATE()
    self.assertEqual(basic_sql_date_type_info,
                     _from_java_type(basic_sql_date_type_info.get_java_type_info()))

    basic_sql_time_type_info = Types.SQL_TIME()
    self.assertEqual(basic_sql_time_type_info,
                     _from_java_type(basic_sql_time_type_info.get_java_type_info()))

    basic_sql_timestamp_type_info = Types.SQL_TIMESTAMP()
    self.assertEqual(basic_sql_timestamp_type_info,
                     _from_java_type(basic_sql_timestamp_type_info.get_java_type_info()))

    row_type_info = Types.ROW([Types.INT(), Types.STRING()])
    self.assertEqual(row_type_info, _from_java_type(row_type_info.get_java_type_info()))

    tuple_type_info = Types.TUPLE([Types.CHAR(), Types.INT()])
    self.assertEqual(tuple_type_info, _from_java_type(tuple_type_info.get_java_type_info()))

    primitive_int_array_type_info = Types.PRIMITIVE_ARRAY(Types.INT())
    self.assertEqual(primitive_int_array_type_info,
                     _from_java_type(primitive_int_array_type_info.get_java_type_info()))

    object_array_type_info = Types.OBJECT_ARRAY(Types.SQL_DATE())
    self.assertEqual(object_array_type_info,
                     _from_java_type(object_array_type_info.get_java_type_info()))

    pickled_byte_array_type_info = Types.PICKLED_BYTE_ARRAY()
    self.assertEqual(pickled_byte_array_type_info,
                     _from_java_type(pickled_byte_array_type_info.get_java_type_info()))

    sql_date_type_info = Types.SQL_DATE()
    self.assertEqual(sql_date_type_info,
                     _from_java_type(sql_date_type_info.get_java_type_info()))

    map_type_info = Types.MAP(Types.INT(), Types.STRING())
    self.assertEqual(map_type_info, _from_java_type(map_type_info.get_java_type_info()))

    list_type_info = Types.LIST(Types.INT())
    self.assertEqual(list_type_info, _from_java_type(list_type_info.get_java_type_info()))
def test_from_and_to_data_stream_event_time(self):
    from pyflink.table import Schema

    ds = self.env.from_collection(
        [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
        Types.ROW_NAMED(["a", "b", "c"], [Types.LONG(), Types.INT(), Types.STRING()]))
    ds = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_monotonous_timestamps()
        .with_timestamp_assigner(MyTimestampAssigner()))

    table = self.t_env.from_data_stream(ds,
                                        Schema.new_builder()
                                              .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                              .watermark("rowtime", "SOURCE_WATERMARK()")
                                              .build())
    self.assertEqual("""(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
                     table._j_table.getResolvedSchema().toString())

    self.t_env.create_temporary_view("t", ds,
                                     Schema.new_builder()
                                           .column_by_metadata("rowtime", "TIMESTAMP_LTZ(3)")
                                           .watermark("rowtime", "SOURCE_WATERMARK()")
                                           .build())
    result = self.t_env.execute_sql("SELECT "
                                    "c, SUM(b) "
                                    "FROM t "
                                    "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
    with result.collect() as result:
        collected_result = [str(item) for item in result]
        expected_result = [item for item in
                           map(str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])]
        expected_result.sort()
        collected_result.sort()
        self.assertEqual(expected_result, collected_result)

    ds = self.t_env.to_data_stream(table)
    ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
        .window(MyTumblingEventTimeWindow()) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)
    self.env.execute()
    expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
    actual_results = self.test_sink.get_results(False)
    expected_results.sort()
    actual_results.sort()
    self.assertEqual(expected_results, actual_results)
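# The last test above references MyTimestampAssigner, MyTumblingEventTimeWindow and
# SumWindowFunction, which are defined elsewhere in the test module. Assumed sketches of the
# two simple helpers are shown below (the custom window assigner is omitted; the test only
# requires it to behave like a 5 ms tumbling event-time window):
from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream.functions import WindowFunction


class MyTimestampAssigner(TimestampAssigner):

    def extract_timestamp(self, value, record_timestamp) -> int:
        # Field 'a' (the first field) is used as the event timestamp.
        return int(value[0])


class SumWindowFunction(WindowFunction):

    def apply(self, key, window, inputs):
        # Emits (key, sum of field 'b') per window, matching the expected '(a,47)' style output.
        yield key, sum(e[1] for e in inputs)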