def test_json_row_serialization_deserialization_schema(self):
    jsons = [
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\", "
        "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},\"ids\":[1, 2, 3]}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}"
    ]
    # fields absent from the input JSON deserialize to null and are written
    # back out as explicit nulls on the serialization side
    expected_jsons = [
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\","
        "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},"
        "\"ids\":[1,2,3]}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}"
    ]

    row_schema = Types.ROW_NAMED(
        ["svt", "ops", "ids"],
        [Types.STRING(),
         Types.ROW_NAMED(['id'], [Types.STRING()]),
         Types.PRIMITIVE_ARRAY(Types.INT())])

    json_row_serialization_schema = JsonRowSerializationSchema.builder() \
        .with_type_info(row_schema).build()
    json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(row_schema).build()

    # round-trip each payload through the wrapped Java schemas
    for i in range(len(jsons)):
        j_row = json_row_deserialization_schema._j_deserialization_schema \
            .deserialize(bytes(jsons[i], encoding='utf-8'))
        result = str(json_row_serialization_schema._j_serialization_schema
                     .serialize(j_row), encoding='utf-8')
        self.assertEqual(expected_jsons[i], result)
def test_rabbitmq_connectors(self):
    connection_config = RMQConnectionConfig.Builder() \
        .set_host('localhost') \
        .set_port(5672) \
        .set_virtual_host('/') \
        .set_user_name('guest') \
        .set_password('guest') \
        .build()
    type_info = Types.ROW([Types.INT(), Types.STRING()])
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    rmq_source = RMQSource(connection_config, 'source_queue', True, deserialization_schema)
    self.assertEqual(
        get_field_value(rmq_source.get_java_function(), 'queueName'), 'source_queue')
    self.assertTrue(
        get_field_value(rmq_source.get_java_function(), 'usesCorrelationId'))

    serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
        .build()
    rmq_sink = RMQSink(connection_config, 'sink_queue', serialization_schema)
    self.assertEqual(
        get_field_value(rmq_sink.get_java_function(), 'queueName'), 'sink_queue')
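# The assertions above only inspect the wrapped Java objects. A minimal
# sketch of the same source and sink attached to a running job; the
# environment setup and job name are illustrative, and connector import
# paths vary across Flink releases:
env = StreamExecutionEnvironment.get_execution_environment()
env.add_source(rmq_source, source_name='rmq-source', type_info=type_info) \
    .add_sink(rmq_sink)
env.execute('rabbitmq-roundtrip')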
def _check_record(data, topic, serialized_data):
    # presumably defined inside a test method, so `self` and `_select`
    # are captured from the enclosing scope
    input_type = Types.ROW([Types.STRING()])

    serialization_schema = KafkaRecordSerializationSchema.builder() \
        .set_topic_selector(_select) \
        .set_value_serialization_schema(
            JsonRowSerializationSchema.builder().with_type_info(input_type).build()) \
        .build()

    # open the schema with dummy initialization/sink contexts before serializing
    jvm = get_gateway().jvm
    serialization_schema._j_serialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext(),
        jvm.org.apache.flink.connector.kafka.sink.DefaultKafkaSinkContext(
            0, 1, jvm.java.util.Properties()))

    sink = KafkaSink.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_record_serializer(serialization_schema) \
        .build()

    ds = MockDataStream(Types.ROW([Types.STRING()]))
    ds.sink_to(sink)
    row = Row(data)
    topic_row = ds.feed(row)  # type: Row
    j_record = serialization_schema._j_serialization_schema.serialize(
        to_java_data_structure(topic_row), None, None)
    self.assertEqual(j_record.topic(), topic)
    self.assertIsNone(j_record.key())
    self.assertEqual(j_record.value(), serialized_data)
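# `_select` is referenced above but not shown: it is the topic-selector
# callable passed to set_topic_selector, receiving the row being serialized
# and returning the destination topic name. A hypothetical sketch:
def _select(data):
    # route each record to a topic derived from its first field
    return data[0] + '-topic'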
def test_set_topic(self):
    input_type = Types.ROW([Types.STRING()])

    serialization_schema = KafkaRecordSerializationSchema.builder() \
        .set_topic('test-topic') \
        .set_value_serialization_schema(
            JsonRowSerializationSchema.builder().with_type_info(input_type).build()) \
        .build()
    j_record = serialization_schema._j_serialization_schema.serialize(
        to_java_data_structure(Row('test')), None, None)
    self.assertEqual(j_record.topic(), 'test-topic')
    self.assertIsNone(j_record.key())
    self.assertEqual(j_record.value(), b'{"f0":"test"}')
def sink_to_kafka(self, topic: str, table: Table, typeinfo: TypeInformation,
                  pool_size: int = 1) -> None:
    builder = JsonRowSerializationSchema.builder()
    builder.with_type_info(typeinfo)
    # a retract stream yields (flag, row) pairs; keep only the accumulate
    # (flag == True) messages and strip the flag before sinking
    stream = self.table_env.to_retract_stream(table, typeinfo)
    stream = stream.filter(lambda x: x[0]) \
        .map(lambda x: x[1], output_type=typeinfo)
    stream.add_sink(
        FlinkKafkaProducer(
            topic,
            builder.build(),
            producer_config={'bootstrap.servers': self.kafka_addr},
            kafka_producer_pool_size=pool_size))
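# A hypothetical call to the sink_to_kafka helper above, assuming
# self.table_env is a StreamTableEnvironment and the table schema matches
# the row type; the topic and element values are illustrative:
row_type = Types.ROW([Types.INT(), Types.STRING()])
table = self.table_env.from_elements([(1, 'hello')], ['id', 'msg'])
self.sink_to_kafka('output_topic', table, row_type, pool_size=2)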
def kafka_connector_assertion(self, flink_kafka_consumer_clz, flink_kafka_producer_clz):
    source_topic = 'test_source_topic'
    sink_topic = 'test_sink_topic'
    props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    type_info = Types.ROW([Types.INT(), Types.STRING()])

    # Test for kafka consumer
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    flink_kafka_consumer = flink_kafka_consumer_clz(source_topic, deserialization_schema,
                                                    props)
    flink_kafka_consumer.set_start_from_earliest()
    flink_kafka_consumer.set_commit_offsets_on_checkpoints(True)

    j_properties = get_private_field(flink_kafka_consumer.get_java_function(), 'properties')
    self.assertEqual('localhost:9092', j_properties.getProperty('bootstrap.servers'))
    self.assertEqual('test_group', j_properties.getProperty('group.id'))
    self.assertTrue(get_private_field(flink_kafka_consumer.get_java_function(),
                                      'enableCommitOnCheckpoints'))
    j_start_up_mode = get_private_field(flink_kafka_consumer.get_java_function(),
                                        'startupMode')

    j_deserializer = get_private_field(flink_kafka_consumer.get_java_function(),
                                       'deserializer')
    j_deserialize_type_info = invoke_java_object_method(j_deserializer, "getProducedType")
    deserialize_type_info = typeinfo._from_java_type(j_deserialize_type_info)
    self.assertTrue(deserialize_type_info == type_info)
    self.assertTrue(j_start_up_mode.equals(get_gateway().jvm
                                           .org.apache.flink.streaming.connectors
                                           .kafka.config.StartupMode.EARLIEST))
    j_topic_desc = get_private_field(flink_kafka_consumer.get_java_function(),
                                     'topicsDescriptor')
    j_topics = invoke_java_object_method(j_topic_desc, 'getFixedTopics')
    self.assertEqual(['test_source_topic'], list(j_topics))

    # Test for kafka producer
    serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
        .build()
    flink_kafka_producer = flink_kafka_producer_clz(sink_topic, serialization_schema, props)
    flink_kafka_producer.set_write_timestamp_to_kafka(False)

    j_producer_config = get_private_field(flink_kafka_producer.get_java_function(),
                                          'producerConfig')
    self.assertEqual('localhost:9092', j_producer_config.getProperty('bootstrap.servers'))
    self.assertEqual('test_group', j_producer_config.getProperty('group.id'))
    self.assertFalse(get_private_field(flink_kafka_producer.get_java_function(),
                                       'writeTimestampToKafka'))
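# A hypothetical caller for the assertion helper above, passing in the
# pre-FLIP-27 connector classes; the import path matches older PyFlink
# releases and may differ in newer ones:
from pyflink.datastream.connectors import FlinkKafkaConsumer, FlinkKafkaProducer

def test_kafka_connectors(self):
    self.kafka_connector_assertion(FlinkKafkaConsumer, FlinkKafkaProducer)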
def test_set_topic(self):
    input_type = Types.ROW([Types.STRING()])

    serialization_schema = KafkaRecordSerializationSchema.builder() \
        .set_topic('test-topic') \
        .set_value_serialization_schema(
            JsonRowSerializationSchema.builder().with_type_info(input_type).build()) \
        .build()

    # open the schema with dummy initialization/sink contexts before serializing
    jvm = get_gateway().jvm
    serialization_schema._j_serialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext(),
        jvm.org.apache.flink.connector.kafka.sink.DefaultKafkaSinkContext(
            0, 1, jvm.java.util.Properties()))

    j_record = serialization_schema._j_serialization_schema.serialize(
        to_java_data_structure(Row('test')), None, None)
    self.assertEqual(j_record.topic(), 'test-topic')
    self.assertIsNone(j_record.key())
    self.assertEqual(j_record.value(), b'{"f0":"test"}')
def _check_record(data, topic, serialized_data):
    input_type = Types.ROW([Types.STRING()])

    serialization_schema = KafkaRecordSerializationSchema.builder() \
        .set_topic_selector(_select) \
        .set_value_serialization_schema(
            JsonRowSerializationSchema.builder().with_type_info(input_type).build()) \
        .build()
    sink = KafkaSink.builder() \
        .set_bootstrap_servers('localhost:9092') \
        .set_record_serializer(serialization_schema) \
        .build()

    ds = MockDataStream(Types.ROW([Types.STRING()]))
    ds.sink_to(sink)
    row = Row(data)
    topic_row = ds.feed(row)  # type: Row
    j_record = serialization_schema._j_serialization_schema.serialize(
        to_java_data_structure(topic_row), None, None)
    self.assertEqual(j_record.topic(), topic)
    self.assertIsNone(j_record.key())
    self.assertEqual(j_record.value(), serialized_data)
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()

    source_type_info = Types.ROW([Types.STRING(), Types.INT()])
    json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(source_type_info).build()
    source_topic = 'test-python-data-stream-source'
    consumer_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-source'
    }
    kafka_consumer_1 = FlinkKafkaConsumer(source_topic, json_row_deserialization_schema,
                                          consumer_props)
    kafka_consumer_1.set_start_from_earliest()
    source_stream_1 = env.add_source(kafka_consumer_1).name('kafka source 1')

    mapped_type_info = Types.ROW([Types.STRING(), Types.INT(), Types.INT()])
    keyed_stream = source_stream_1.map(add_one, output_type=mapped_type_info) \
        .key_by(lambda x: x[2])
    flat_mapped_stream = keyed_stream.flat_map(m_flat_map, result_type=mapped_type_info)
    flat_mapped_stream.name("flat-map").set_parallelism(3)

    sink_topic = 'test-python-data-stream-sink'
    producer_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-1'
    }
    json_row_serialization_schema = JsonRowSerializationSchema.builder() \
        .with_type_info(mapped_type_info).build()
    kafka_producer = FlinkKafkaProducer(
        topic=sink_topic,
        producer_config=producer_props,
        serialization_schema=json_row_serialization_schema)
    flat_mapped_stream.add_sink(kafka_producer)
    env.execute_async("test data stream to kafka")
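# `add_one` and `m_flat_map` are referenced above but defined elsewhere in
# the e2e module. Hypothetical stand-ins that satisfy the declared type
# infos, widening (STRING, INT) rows to (STRING, INT, INT):
from pyflink.common import Row

def add_one(value):
    # append a derived INT field so the row matches mapped_type_info
    return Row(value[0], value[1], value[1] + 1)

def m_flat_map(value):
    # pass-through flat map emitting each keyed row unchanged
    yield value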
def _check_serialization_schema_implementations(check_function):
    input_type = Types.ROW([Types.STRING()])

    check_function(
        JsonRowSerializationSchema.builder().with_type_info(input_type).build(),
        'org.apache.flink.formats.json.JsonRowSerializationSchema')
    check_function(
        CsvRowSerializationSchema.Builder(input_type).build(),
        'org.apache.flink.formats.csv.CsvRowSerializationSchema')
    avro_schema_string = """
    {
        "type": "record",
        "name": "test_record",
        "fields": []
    }
    """
    check_function(
        AvroRowSerializationSchema(avro_schema_string=avro_schema_string),
        'org.apache.flink.formats.avro.AvroRowSerializationSchema')
    check_function(
        SimpleStringSchema(),
        'org.apache.flink.api.common.serialization.SimpleStringSchema')
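# One plausible `check_function`, assuming each Python schema wraps its Java
# counterpart in `_j_serialization_schema` (the convention used throughout
# the tests above); the helper name is illustrative:
def _assert_java_class(serialization_schema, expected_class_name):
    j_class_name = serialization_schema._j_serialization_schema \
        .getClass().getCanonicalName()
    assert j_class_name == expected_class_name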