def test_execute_and_collect(self):
    """Verify DataStream.execute_and_collect() with a result limit and as a
    context manager, for plain strings and for rows mixing many data types.
    """
    test_data = ['pyflink', 'datastream', 'execute', 'collect']
    ds = self.env.from_collection(test_data)
    # limit=3 should yield only the first three elements, in order.
    expected = test_data[:3]
    actual = []
    for result in ds.execute_and_collect(limit=3):
        actual.append(result)
    self.assertEqual(expected, actual)

    # No limit: the context-manager form should yield every element and
    # close the result iterator on exit.
    expected = test_data
    ds = self.env.from_collection(collection=test_data, type_info=Types.STRING())
    with ds.execute_and_collect() as results:
        actual = []
        for result in results:
            actual.append(result)
        self.assertEqual(expected, actual)

    # Rows mixing ints, None, booleans, floats, bytes, strings, date/time
    # values, nested lists and high-precision decimals must round-trip
    # through execute_and_collect unchanged.
    test_data = [
        (1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
         bytearray(b'flink'), 'pyflink',
         datetime.date(2014, 9, 13),
         datetime.time(hour=12, minute=0, second=0, microsecond=123000),
         datetime.datetime(2018, 3, 11, 3, 0, 0, 123000),
         [1, 2, 3],
         [['pyflink', 'datastream'], ['execute', 'collect']],
         decimal.Decimal('1000000000000000000.05'),
         decimal.Decimal('1000000000000000000.0599999999999'
                         '9999899999999999')),
        (2, None, 2, True, 23878, 652516352, 9.87, 2.98936,
         bytearray(b'flink'), 'pyflink',
         datetime.date(2015, 10, 14),
         datetime.time(hour=11, minute=2, second=2, microsecond=234500),
         datetime.datetime(2020, 4, 15, 8, 2, 6, 235000),
         [2, 4, 6],
         [['pyflink', 'datastream'], ['execute', 'collect']],
         decimal.Decimal('2000000000000000000.74'),
         decimal.Decimal('2000000000000000000.061111111111111'
                         '11111111111111'))
    ]
    expected = test_data
    ds = self.env.from_collection(test_data)
    with ds.execute_and_collect() as results:
        actual = []
        for result in results:
            actual.append(result)
        self.assertEqual(expected, actual)
def test_jdbc_sink(self):
    """JdbcSink should surface in the execution plan and expose exactly the
    connection/execution options the builders were configured with."""
    ds = self.env.from_collection(
        [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))
    conn_opts = JdbcConnectionOptions.JdbcConnectionOptionsBuilder() \
        .with_driver_name('com.mysql.jdbc.Driver') \
        .with_user_name('root') \
        .with_password('password') \
        .with_url('jdbc:mysql://server-name:server-port/database-name') \
        .build()
    exec_opts = JdbcExecutionOptions.builder() \
        .with_batch_interval_ms(2000) \
        .with_batch_size(100) \
        .with_max_retries(5) \
        .build()
    sink = JdbcSink.sink("insert into test table", ds.get_type(), conn_opts, exec_opts)
    ds.add_sink(sink).name('jdbc sink')

    plan = eval(self.env.get_execution_plan())
    self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])

    # Dig the effective options back out of the underlying Java output
    # format and compare them field by field with the builder inputs.
    j_output_format = get_field_value(sink.get_java_function(), 'outputFormat')
    actual_conn = JdbcConnectionOptions(
        get_field_value(
            get_field_value(j_output_format, 'connectionProvider'), 'jdbcOptions'))
    self.assertEqual(conn_opts.get_db_url(), actual_conn.get_db_url())
    self.assertEqual(conn_opts.get_driver_name(), actual_conn.get_driver_name())
    self.assertEqual(conn_opts.get_password(), actual_conn.get_password())
    self.assertEqual(conn_opts.get_user_name(), actual_conn.get_user_name())

    actual_exec = JdbcExecutionOptions(
        get_field_value(j_output_format, 'executionOptions'))
    self.assertEqual(exec_opts.get_batch_interval_ms(), actual_exec.get_batch_interval_ms())
    self.assertEqual(exec_opts.get_batch_size(), actual_exec.get_batch_size())
    self.assertEqual(exec_opts.get_max_retries(), actual_exec.get_max_retries())
def kafka_connector_assertion(self, flink_kafka_consumer_clz, flink_kafka_producer_clz):
    """Shared assertions for a FlinkKafkaConsumer/FlinkKafkaProducer class pair.

    Builds a JSON-deserializing consumer and a JSON-serializing producer from
    the given classes and verifies, via the private fields of the underlying
    Java objects, that the Python-side configuration was applied.
    """
    source_topic = 'test_source_topic'
    sink_topic = 'test_sink_topic'
    props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    type_info = Types.ROW([Types.INT(), Types.STRING()])

    # Test for kafka consumer
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()
    flink_kafka_consumer = flink_kafka_consumer_clz(source_topic, deserialization_schema, props)
    flink_kafka_consumer.set_start_from_earliest()
    flink_kafka_consumer.set_commit_offsets_on_checkpoints(True)

    # The Kafka properties should be passed through to the Java consumer.
    j_properties = get_private_field(flink_kafka_consumer.get_java_function(), 'properties')
    self.assertEqual('localhost:9092', j_properties.getProperty('bootstrap.servers'))
    self.assertEqual('test_group', j_properties.getProperty('group.id'))
    # set_commit_offsets_on_checkpoints(True) flips the Java-side flag.
    self.assertTrue(get_private_field(flink_kafka_consumer.get_java_function(),
                                      'enableCommitOnCheckpoints'))
    j_start_up_mode = get_private_field(flink_kafka_consumer.get_java_function(), 'startupMode')

    # The deserializer's produced type must round-trip back to the Python
    # type info it was built from.
    j_deserializer = get_private_field(flink_kafka_consumer.get_java_function(), 'deserializer')
    j_deserialize_type_info = invoke_java_object_method(j_deserializer, "getProducedType")
    deserialize_type_info = typeinfo._from_java_type(j_deserialize_type_info)
    self.assertTrue(deserialize_type_info == type_info)
    # set_start_from_earliest() maps to StartupMode.EARLIEST.
    self.assertTrue(j_start_up_mode.equals(get_gateway().jvm
                                           .org.apache.flink.streaming.connectors
                                           .kafka.config.StartupMode.EARLIEST))
    j_topic_desc = get_private_field(flink_kafka_consumer.get_java_function(),
                                     'topicsDescriptor')
    j_topics = invoke_java_object_method(j_topic_desc, 'getFixedTopics')
    self.assertEqual(['test_source_topic'], list(j_topics))

    # Test for kafka producer
    serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
        .build()
    flink_kafka_producer = flink_kafka_producer_clz(sink_topic, serialization_schema, props)
    flink_kafka_producer.set_write_timestamp_to_kafka(False)
    # Producer properties and the write-timestamp flag should also be
    # visible on the Java side.
    j_producer_config = get_private_field(flink_kafka_producer.get_java_function(),
                                          'producerConfig')
    self.assertEqual('localhost:9092', j_producer_config.getProperty('bootstrap.servers'))
    self.assertEqual('test_group', j_producer_config.getProperty('group.id'))
    self.assertFalse(get_private_field(flink_kafka_producer.get_java_function(),
                                       'writeTimestampToKafka'))
def test_tuple_type(self):
    """TupleTypeInfo equality, string rendering, and the Types.TUPLE factory.

    BUGFIX: the original passed a stray third positional argument (True) to
    assertEqual, which unittest silently consumes as the failure *msg*
    parameter; the stray argument has been removed.
    """
    self.assertEqual(TupleTypeInfo([Types.STRING(), Types.INT()]),
                     TupleTypeInfo([Types.STRING(), Types.INT()]))
    self.assertEqual(
        TupleTypeInfo([Types.STRING(), Types.INT()]).__str__(),
        "TupleTypeInfo(String, Integer)")
    # Differing element types must compare unequal.
    self.assertNotEqual(TupleTypeInfo([Types.STRING(), Types.INT()]),
                        TupleTypeInfo([Types.STRING(), Types.BOOLEAN()]))
    # The Types.TUPLE factory should produce an equivalent TupleTypeInfo.
    self.assertEqual(Types.TUPLE([Types.STRING(), Types.INT()]),
                     TupleTypeInfo([Types.STRING(), Types.INT()]))
    self.assertEqual(
        Types.TUPLE([Types.STRING(), Types.INT()]).get_field_types(),
        [Types.STRING(), Types.INT()])
def _align_output_type(self) -> 'DataStream':
    """
    Map pickled Python objects to their string form when this stream's
    output type is the pickled-byte-array type info; otherwise return the
    stream unchanged.
    """
    pickled_cls = PickledBytesTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO() \
        .get_java_type_info().getClass()
    output_cls = self._j_data_stream.getTransformation().getOutputType().getClass()
    if not output_cls.isAssignableFrom(pickled_cls):
        # Already a concrete (non-pickled) output type: nothing to align.
        return self

    def _stringify(record):
        # str and bytes pass through untouched; everything else is str()-ed.
        return record if isinstance(record, (str, bytes)) else str(record)

    return DataStream(
        self.map(_stringify, type_info=Types.STRING())._j_data_stream)
def test_map_function_without_data_types(self):
    """A map without an explicit result type should fall back to pickled
    results and still deliver the expected stringified values to the sink."""
    self.env.set_parallelism(1)
    source = self.env.from_collection(
        [('ab', decimal.Decimal(1)), ('bdc', decimal.Decimal(2)),
         ('cfgs', decimal.Decimal(3)), ('deeefg', decimal.Decimal(4))],
        type_info=Types.ROW([Types.STRING(), Types.BIG_DEC()]))
    source.map(MyMapFunction()).add_sink(self.test_sink)
    self.env.execute('map_function_test')
    results = self.test_sink.get_results(True)
    expected = ["('ab', 2, Decimal('1'))",
                "('bdc', 3, Decimal('2'))",
                "('cfgs', 4, Decimal('3'))",
                "('deeefg', 6, Decimal('4'))"]
    expected.sort()
    results.sort()
    self.assertEqual(expected, results)
def test_csv_row_serialization_schema(self):
    """Round-trip rows through CsvRowSerializationSchema and
    CsvRowDeserializationSchema configured with custom escape ('*'),
    quote ('\\''), array-element (':') and field (';') delimiters."""
    JRow = get_gateway().jvm.org.apache.flink.types.Row
    j_row = JRow(3)
    j_row.setField(0, "BEGIN")
    j_row.setField(2, "END")

    def field_assertion(field_info, csv_value, value, field_delimiter):
        # Serialize a (BEGIN, value, END) row, check the middle field's CSV
        # rendering, then deserialize the CSV back and compare to the row.
        row_info = Types.ROW([Types.STRING(), field_info, Types.STRING()])
        expected_csv = "BEGIN" + field_delimiter + csv_value + field_delimiter + "END\n"
        j_row.setField(1, value)
        csv_row_serialization_schema = CsvRowSerializationSchema.Builder(row_info)\
            .set_escape_character('*').set_quote_character('\'')\
            .set_array_element_delimiter(':').set_field_delimiter(';').build()
        csv_row_deserialization_schema = CsvRowDeserializationSchema.Builder(row_info)\
            .set_escape_character('*').set_quote_character('\'')\
            .set_array_element_delimiter(':').set_field_delimiter(';').build()
        serialized_bytes = csv_row_serialization_schema._j_serialization_schema.serialize(
            j_row)
        self.assertEqual(expected_csv, str(serialized_bytes, encoding='utf-8'))
        j_deserialized_row = csv_row_deserialization_schema._j_deserialization_schema\
            .deserialize(expected_csv.encode("utf-8"))
        self.assertTrue(j_row.equals(j_deserialized_row))

    # Quote and escape characters inside a string value are doubled/escaped.
    field_assertion(Types.STRING(), "'123''4**'", "123'4*", ";")
    # A field delimiter inside the value forces the field to be quoted.
    field_assertion(Types.STRING(), "'a;b''c'", "a;b'c", ";")
    field_assertion(Types.INT(), "12", 12, ";")
    # Nested rows are joined with the array-element delimiter ':'.
    test_j_row = JRow(2)
    test_j_row.setField(0, "1")
    test_j_row.setField(1, "hello")
    field_assertion(Types.ROW([Types.STRING(), Types.STRING()]), "'1:hello'", test_j_row, ";")
    test_j_row.setField(1, "hello world")
    field_assertion(Types.ROW([Types.STRING(), Types.STRING()]), "'1:hello world'",
                    test_j_row, ";")
    field_assertion(Types.STRING(), "null", "null", ";")
def test_event_time_tumbling_window_all(self):
    """Tumbling 5ms event-time windows over the whole (non-keyed) stream
    should each emit one (start, end, count) record."""
    source = self.env.from_collection(
        [('hi', 1), ('hello', 2), ('hi', 3), ('hello', 4), ('hello', 5),
         ('hi', 8), ('hi', 9), ('hi', 15)],
        type_info=Types.TUPLE([Types.STRING(), Types.INT()]))  # type: DataStream
    strategy = WatermarkStrategy.for_monotonous_timestamps() \
        .with_timestamp_assigner(SecondColumnTimestampAssigner())
    source.assign_timestamps_and_watermarks(strategy) \
        .window_all(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
        .process(CountAllWindowProcessFunction(),
                 Types.TUPLE([Types.LONG(), Types.LONG(), Types.INT()])) \
        .add_sink(self.test_sink)
    self.env.execute('test_event_time_tumbling_window_all')
    results = self.test_sink.get_results()
    # (window_start, window_end, element_count) per window.
    expected = ['(0,5,4)', '(15,20,1)', '(5,10,3)']
    self.assert_equals_sorted(expected, results)
def test_set_topic(self):
    """serialize() should route records to the topic configured on the
    KafkaRecordSerializationSchema builder."""
    row_type = Types.ROW([Types.STRING()])
    schema = KafkaRecordSerializationSchema.builder() \
        .set_topic('test-topic') \
        .set_value_serialization_schema(
            JsonRowSerializationSchema.builder().with_type_info(row_type).build()) \
        .build()
    jvm = get_gateway().jvm
    # Open the underlying Java schema against dummy contexts so it can be
    # used without a real sink.
    schema._j_serialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.
        DummyInitializationContext(),
        jvm.org.apache.flink.connector.kafka.sink.DefaultKafkaSinkContext(
            0, 1, jvm.java.util.Properties()))
    record = schema._j_serialization_schema.serialize(
        to_java_data_structure(Row('test')), None, None)
    self.assertEqual(record.topic(), 'test-topic')
    self.assertIsNone(record.key())
    self.assertEqual(record.value(), b'{"f0":"test"}')
def test_add_jars(self):
    """Adding the kafka connector jar via add_jars should make
    FlinkKafkaConsumer loadable when the execution plan is built."""
    # Locate the kafka connector jars under the Flink source tree.
    flink_root = _find_flink_source_root()
    connector_dir = flink_root + '/flink-connectors/flink-sql-connector-kafka'
    jar_urls = ['file://' + jar for jar in glob.glob(connector_dir + '/target/flink*.jar')]
    self.env.add_jars(*jar_urls)

    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=Types.ROW([Types.INT(), Types.STRING()])).build()
    # Would raise a ClassNotFoundException if the kafka connector were not
    # added to the pipeline jars.
    kafka_consumer = FlinkKafkaConsumer(
        'test_source_topic',
        deserialization_schema,
        {'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'})
    self.env.add_source(kafka_consumer).print()
    self.env.get_execution_plan()
def test_kinesis_firehose_sink(self):
    """KinesisFirehoseSink builder settings should be reflected in the Java
    sink object and in the generated execution plan."""
    _load_specific_flink_module_jars(
        '/flink-connectors/'
        'flink-sql-connector-aws-kinesis-firehose')
    props = {
        'aws.region': 'eu-west-1',
        'aws.credentials.provider.basic.accesskeyid': 'aws_access_key_id',
        'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
    }
    stream = self.env.from_collection(
        [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))
    firehose_sink = KinesisFirehoseSink.builder() \
        .set_firehose_client_properties(props) \
        .set_serialization_schema(SimpleStringSchema()) \
        .set_delivery_stream_name('stream-1') \
        .set_fail_on_error(False) \
        .set_max_batch_size(500) \
        .set_max_in_flight_requests(50) \
        .set_max_buffered_requests(10000) \
        .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
        .set_max_time_in_buffer_ms(5000) \
        .set_max_record_size_in_bytes(1 * 1024 * 1024) \
        .build()
    stream.sink_to(firehose_sink).name('kinesis firehose sink')

    plan = eval(self.env.get_execution_plan())
    self.assertEqual('kinesis firehose sink: Writer', plan['nodes'][1]['type'])
    # Spot-check two builder values on the underlying Java sink.
    j_sink = firehose_sink.get_java_function()
    self.assertEqual(get_field_value(j_sink, 'failOnError'), False)
    self.assertEqual(get_field_value(j_sink, 'deliveryStreamName'), 'stream-1')
def test_source_deprecated_method(self):
    """The deprecated set_config_with_dict should still populate the Pulsar
    source configuration alongside set_config."""
    enable_ack_option = ConfigOptions.key('pulsar.source.enableAutoAcknowledgeMessage') \
        .boolean_type().no_default_value()
    source = PulsarSource.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_topics('ada') \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_type_info(Types.STRING(), None)) \
        .set_subscription_name('ff') \
        .set_config(enable_ack_option, True) \
        .set_config_with_dict({'pulsar.source.autoCommitCursorInterval': '1000'}) \
        .build()
    conf = get_field_value(source.get_java_function(), "sourceConfiguration")
    # Option passed via set_config.
    self.assertEqual(conf.getBoolean(enable_ack_option._j_config_option), True)
    # Option passed via the deprecated dict-based setter.
    interval_option = ConfigOptions.key('pulsar.source.autoCommitCursorInterval') \
        .long_type().no_default_value()
    self.assertEqual(conf.getLong(interval_option._j_config_option), 1000)
def test_map_function_without_data_types(self):
    """Map results without explicit type info should be collectable through
    DataStreamCollectUtil in their pickled string form."""
    self.env.set_parallelism(1)
    source = self.env.from_collection(
        [('ab', decimal.Decimal(1)), ('bdc', decimal.Decimal(2)),
         ('cfgs', decimal.Decimal(3)), ('deeefg', decimal.Decimal(4))],
        type_info=Types.ROW([Types.STRING(), Types.BIG_DEC()]))
    collector = DataStreamCollectUtil()
    collector.collect(source.map(MyMapFunction()))
    self.env.execute('map_function_test')
    results = collector.results()
    expected = ["('ab', 2, Decimal('1'))",
                "('bdc', 3, Decimal('2'))",
                "('cfgs', 4, Decimal('3'))",
                "('deeefg', 6, Decimal('4'))"]
    expected.sort()
    results.sort()
    self.assertEqual(expected, results)
def test_to_retract_stream(self):
    """to_retract_stream should emit (is_insert, row) pairs, including the
    retraction produced when an aggregate value is updated."""
    self.env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.in_streaming_mode())
    source_table = t_env.from_elements(
        [(1, "Hi", "Hello"), (1, "Hi", "Hello")], ["a", "b", "c"])
    aggregated = source_table.group_by("c").select("a.sum, c as b")
    retract_stream = t_env.to_retract_stream(
        table=aggregated,
        type_info=Types.ROW([Types.LONG(), Types.STRING()]))
    sink = DataStreamTestSinkFunction()
    retract_stream.map(lambda x: x).add_sink(sink)
    self.env.execute("test_to_retract_stream")
    result = sink.get_results(True)
    # Second input row retracts the first sum (1) and inserts the new sum (2).
    expected = ["(True, Row(f0=1, f1='Hello'))",
                "(False, Row(f0=1, f1='Hello'))",
                "(True, Row(f0=2, f1='Hello'))"]
    self.assertEqual(result, expected)
def test_rabbitmq_connectors(self):
    """RMQSource/RMQSink should carry the queue names and the
    correlation-id flag given at construction."""
    connection_config = RMQConnectionConfig.Builder() \
        .set_host('localhost') \
        .set_port(5672) \
        .set_virtual_host('/') \
        .set_user_name('guest') \
        .set_password('guest') \
        .build()
    row_type = Types.ROW([Types.INT(), Types.STRING()])
    deserializer = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=row_type).build()
    source = RMQSource(connection_config, 'source_queue', True, deserializer)
    self.assertEqual(
        get_field_value(source.get_java_function(), 'queueName'), 'source_queue')
    self.assertTrue(get_field_value(source.get_java_function(), 'usesCorrelationId'))

    serializer = JsonRowSerializationSchema.builder().with_type_info(row_type) \
        .build()
    sink = RMQSink(connection_config, 'sink_queue', serializer)
    self.assertEqual(
        get_field_value(sink.get_java_function(), 'queueName'), 'sink_queue')
def _create_parquet_array_row_and_data() -> Tuple[RowType, RowTypeInfo, List[Row]]:
    """Build a matching (SQL row type, Python type info, sample rows) triple
    for testing Parquet array columns."""
    # (field name, SQL element type, Python element type info)
    fields = [
        ('string_array', DataTypes.STRING(), Types.STRING()),
        ('int_array', DataTypes.INT(), Types.INT()),
    ]
    row_type = DataTypes.ROW([
        DataTypes.FIELD(
            name,
            DataTypes.ARRAY(sql_type).bridged_to('java.util.ArrayList'))
        for name, sql_type, _ in fields
    ])
    row_type_info = Types.ROW_NAMED(
        [name for name, _, _ in fields],
        [Types.LIST(element_info) for _, _, element_info in fields])
    data = [Row(
        string_array=['a', 'b', 'c'],
        int_array=[1, 2, 3],
    )]
    return row_type, row_type_info, data
def _check_serialization_schema_implementations(check_function):
    """Apply check_function to each serialization schema implementation and
    the fully-qualified Java class it should wrap."""
    input_type = Types.ROW([Types.STRING()])
    avro_schema_string = """
    {
        "type": "record",
        "name": "test_record",
        "fields": []
    }
    """
    cases = [
        (JsonRowSerializationSchema.builder().with_type_info(input_type).build(),
         'org.apache.flink.formats.json.JsonRowSerializationSchema'),
        (CsvRowSerializationSchema.Builder(input_type).build(),
         'org.apache.flink.formats.csv.CsvRowSerializationSchema'),
        (AvroRowSerializationSchema(avro_schema_string=avro_schema_string),
         'org.apache.flink.formats.avro.AvroRowSerializationSchema'),
        (SimpleStringSchema(),
         'org.apache.flink.api.common.serialization.SimpleStringSchema'),
    ]
    for schema, expected_java_class in cases:
        check_function(schema, expected_java_class)
def python_data_stream_example():
    """Wire a Kafka JSON source through a keyed process function (event time
    plus timers) into a Kafka string sink, then submit asynchronously."""
    env = StreamExecutionEnvironment.get_execution_environment()
    # Parallelism 1 keeps fired timers and regular records on one worker so
    # the collected result is ordered, which the downstream assertions need.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    row_type = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(), Types.LONG(), Types.DOUBLE(), Types.INT(), Types.INT()])
    json_schema = JsonRowDeserializationSchema.builder().type_info(row_type).build()
    kafka_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-source'
    }
    consumer = FlinkKafkaConsumer("timer-stream-source", json_schema, kafka_props)
    consumer.set_start_from_earliest()
    producer = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(), kafka_props)
    watermarks = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5)) \
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    env.add_source(consumer) \
        .assign_timestamps_and_watermarks(watermarks) \
        .key_by(MyKeySelector(), key_type=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(producer)
    env.execute_async("test data stream timer")
def test_kinesis_streams_sink(self):
    """KinesisStreamsSink builder settings should be reflected in the Java
    sink object and in the generated execution plan."""
    props = {
        'aws.region': 'us-east-1',
        'aws.credentials.provider.basic.secretkey': 'aws_secret_access_key'
    }
    stream = self.env.from_collection(
        [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))
    streams_sink = KinesisStreamsSink.builder() \
        .set_kinesis_client_properties(props) \
        .set_serialization_schema(SimpleStringSchema()) \
        .set_partition_key_generator(PartitionKeyGenerator.fixed()) \
        .set_stream_name("stream-1") \
        .set_fail_on_error(False) \
        .set_max_batch_size(500) \
        .set_max_in_flight_requests(50) \
        .set_max_buffered_requests(10000) \
        .set_max_batch_size_in_bytes(5 * 1024 * 1024) \
        .set_max_time_in_buffer_ms(5000) \
        .set_max_record_size_in_bytes(1 * 1024 * 1024) \
        .build()
    stream.sink_to(streams_sink).name('kinesis streams sink')

    plan = eval(self.env.get_execution_plan())
    self.assertEqual('kinesis streams sink: Writer', plan['nodes'][1]['type'])
    # Spot-check two builder values on the underlying Java sink.
    j_sink = streams_sink.get_java_function()
    self.assertEqual(get_field_value(j_sink, 'failOnError'), False)
    self.assertEqual(get_field_value(j_sink, 'streamName'), 'stream-1')
def test_from_java_type(self):
    """Every Python TypeInformation should survive a round trip through its
    Java counterpart via _from_java_type."""
    # Same type infos, in the same order, as the original one-per-stanza
    # assertions; a single loop replaces the repetition.
    round_trip_types = [
        Types.INT(),
        Types.SHORT(),
        Types.LONG(),
        Types.FLOAT(),
        Types.DOUBLE(),
        Types.CHAR(),
        Types.BYTE(),
        Types.BIG_INT(),
        Types.BIG_DEC(),
        Types.SQL_DATE(),
        Types.SQL_TIME(),
        Types.SQL_TIMESTAMP(),
        Types.ROW([Types.INT(), Types.STRING()]),
        Types.TUPLE([Types.CHAR(), Types.INT()]),
        Types.PRIMITIVE_ARRAY(Types.INT()),
        Types.OBJECT_ARRAY(Types.SQL_DATE()),
        Types.PICKLED_BYTE_ARRAY(),
        Types.SQL_DATE(),
        Types.MAP(Types.INT(), Types.STRING()),
        Types.LIST(Types.INT()),
    ]
    for type_info in round_trip_types:
        self.assertEqual(type_info,
                         _from_java_type(type_info.get_java_type_info()))
def test_row_type(self):
    """RowTypeInfo field names/types, equality, string rendering, and the
    Types.ROW / Types.ROW_NAMED factories.

    BUGFIX: three assertEqual calls passed a stray third positional argument
    (True), which unittest silently consumes as the failure *msg* parameter;
    the stray arguments have been removed.
    """
    # Unnamed rows get auto-generated field names f0, f1, ...
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()])
                     .get_field_names(), ['f0', 'f1'])
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()],
                                 ['a', 'b']).get_field_names(), ['a', 'b'])
    # Equality requires matching field types (names equal here).
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']) ==
                     RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']), True)
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()], ['a', 'b']) ==
                     RowTypeInfo([Types.STRING(), Types.INT()], ['a', 'b']), False)
    self.assertEqual(RowTypeInfo([Types.STRING(), Types.STRING()],
                                 ['a', 'b']).__str__(),
                     "RowTypeInfo(a: String, b: String)")
    self.assertEqual(Types.ROW([Types.STRING(), Types.STRING()]),
                     RowTypeInfo([Types.STRING(), Types.STRING()]))
    self.assertEqual(Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()])
                     .get_field_names(), ['a', 'b'])
    self.assertEqual(Types.ROW_NAMED(['a', 'b'], [Types.STRING(), Types.STRING()])
                     .get_field_types(), [Types.STRING(), Types.STRING()])
def ds_operators():
    """Showcase of common DataStream operators: map/flat_map/filter,
    key_by + reduce, partitioning (rescale/broadcast), tuple project,
    and a rolling StreamingFileSink."""
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_python_executable(
        r"D:/ProgramData/Anaconda3/envs/penter/python.exe")
    ds = s_env.from_collection(
        [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hi')],
        type_info=Types.ROW([Types.INT(), Types.STRING(), Types.STRING()]))
    # Operator overview (translated from the original notes):
    #   map / flat_map / filter
    #   key_by: DataStream -> KeyedStream; reduce: KeyedStream -> DataStream
    #   union: DataStream* -> DataStream
    #   connect: DataStream, DataStream -> ConnectedStreams
    #   project: tuple streams only
    #   partitioning: partition_custom (custom partitioner),
    #     shuffle (uniform random), rebalance (round-robin),
    #     rescale (repartition), broadcast (element to every partition)
    #   process: keyed state and TimerService are only accessible when a
    #     ProcessFunction is applied on a KeyedStream
    #   misc: start_new_chain, disable_chaining, slot_sharing_group
    ds.rescale()
    # BUGFIX: map/flat_map/filter require a function argument; the originals
    # were called with no arguments, which raises TypeError at runtime.
    ds.map(lambda r: r)
    ds.flat_map(lambda r: [r])
    ds.filter(lambda r: True)
    # KeyBy DataStream -> KeyedStream, then Reduce back to a DataStream.
    ds = s_env.from_collection(
        [(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')],
        type_info=Types.ROW([Types.INT(), Types.STRING()]))
    ds.key_by(lambda a: a[1]) \
        .reduce(lambda a, b: Row(a[0] + b[0], b[1]))
    # Broadcast each element to every partition.
    ds.broadcast()
    # project works on tuple streams only; select tuple indices 1 and 3.
    ds = s_env.from_collection(
        [[1, 2, 3, 4], [5, 6, 7, 8]],
        type_info=Types.TUPLE([
            Types.INT(), Types.INT(), Types.INT(), Types.INT()
        ]))
    # BUGFIX: the chain previously ended in add_sink() with no sink function;
    # print the projected stream instead.
    ds.project(1, 3).map(lambda x: (x[0], x[1] + 1)).print()
    # Persist with a rolling file sink.
    ds.add_sink(
        StreamingFileSink.for_row_format('/tmp/output', SimpleStringEncoder())
        .with_rolling_policy(
            DefaultRollingPolicy.builder()
            .with_rollover_interval(15 * 60 * 1000)
            .with_inactivity_interval(5 * 60 * 1000)
            .with_max_part_size(1024 * 1024 * 1024)
            .build())
        .with_output_file_config(
            OutputFileConfig.OutputFileConfigBuilder()
            .with_part_prefix("prefix")
            .with_part_suffix("suffix")
            .build())
        .build())
    s_env.execute('ds_operators')
def test_from_and_to_data_stream_event_time(self):
    """Round-trip a DataStream with event-time timestamps through the Table
    API: rowtime metadata column with a source watermark, SQL tumbling-window
    aggregation, and a windowed apply after converting back to a DataStream.
    """
    from pyflink.table import Schema
    ds = self.env.from_collection(
        [(1, 42, "a"), (2, 5, "a"), (3, 1000, "c"), (100, 1000, "c")],
        Types.ROW_NAMED(["a", "b", "c"],
                        [Types.LONG(), Types.INT(), Types.STRING()]))
    ds = ds.assign_timestamps_and_watermarks(
        WatermarkStrategy.for_monotonous_timestamps(
        ).with_timestamp_assigner(MyTimestampAssigner()))

    # Expose the stream's timestamps as a rowtime metadata column and
    # propagate the stream's watermark into the table.
    table = self.t_env.from_data_stream(
        ds,
        Schema.new_builder().column_by_metadata(
            "rowtime", "TIMESTAMP_LTZ(3)").watermark(
            "rowtime", "SOURCE_WATERMARK()").build())
    self.assertEqual(
        """(
  `a` BIGINT,
  `b` INT,
  `c` STRING,
  `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA,
  WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK()
)""",
        table._j_table.getResolvedSchema().toString())
    self.t_env.create_temporary_view(
        "t",
        ds,
        Schema.new_builder().column_by_metadata(
            "rowtime", "TIMESTAMP_LTZ(3)").watermark(
            "rowtime", "SOURCE_WATERMARK()").build())

    # 5ms tumbling windows over the rowtime column, summing b per key c.
    result = self.t_env.execute_sql(
        "SELECT "
        "c, SUM(b) "
        "FROM t "
        "GROUP BY c, TUMBLE(rowtime, INTERVAL '0.005' SECOND)")
    with result.collect() as result:
        collected_result = [str(item) for item in result]
        expected_result = [
            item for item in map(
                str, [Row('a', 47), Row('c', 1000), Row('c', 1000)])
        ]
        expected_result.sort()
        collected_result.sort()
        self.assertEqual(expected_result, collected_result)

    # Convert back and run a custom tumbling event-time window per key;
    # the results must match the SQL aggregation above.
    ds = self.t_env.to_data_stream(table)
    ds.key_by(lambda k: k.c, key_type=Types.STRING()) \
        .window(MyTumblingEventTimeWindow()) \
        .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()])) \
        .add_sink(self.test_sink)
    self.env.execute()
    expected_results = ['(a,47)', '(c,1000)', '(c,1000)']
    actual_results = self.test_sink.get_results(False)
    expected_results.sort()
    actual_results.sort()
    self.assertEqual(expected_results, actual_results)
def test_generate_stream_graph_with_dependencies(self):
    """The stream graph should keep names and parallelism of user-defined
    operators, and user-specified Python files/archives should end up in the
    Python dependency configuration."""
    python_file_dir = os.path.join(self.tempdir,
                                   "python_file_dir_" + str(uuid.uuid4()))
    os.mkdir(python_file_dir)
    python_file_path = os.path.join(
        python_file_dir, "test_stream_dependency_manage_lib.py")
    with open(python_file_path, 'w') as f:
        f.write("def add_two(a):\n return a + 2")
    env = self.env
    env.add_python_file(python_file_path)

    def plus_two_map(value):
        # Imported at execution time from the Python file registered above.
        from test_stream_dependency_manage_lib import add_two
        return value[0], add_two(value[1])

    def add_from_file(i):
        # "data" is the target directory of the archive registered below.
        with open("data/data.txt", 'r') as f:
            return i[0], i[1] + int(f.read())

    from_collection_source = env.from_collection(
        [('a', 0), ('b', 0), ('c', 1), ('d', 1), ('e', 2)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))
    from_collection_source.name("From Collection")
    keyed_stream = from_collection_source.key_by(lambda x: x[1],
                                                 key_type=Types.INT())
    plus_two_map_stream = keyed_stream.map(plus_two_map).name(
        "Plus Two Map").set_parallelism(3)
    add_from_file_map = plus_two_map_stream.map(add_from_file).name(
        "Add From File Map")
    test_stream_sink = add_from_file_map.add_sink(
        self.test_sink).name("Test Sink")
    test_stream_sink.set_parallelism(4)

    # Build a zip archive containing data.txt and register it under "data".
    archive_dir_path = os.path.join(self.tempdir,
                                    "archive_" + str(uuid.uuid4()))
    os.mkdir(archive_dir_path)
    with open(os.path.join(archive_dir_path, "data.txt"), 'w') as f:
        f.write("3")
    archive_file_path = \
        shutil.make_archive(os.path.dirname(archive_dir_path), 'zip',
                            archive_dir_path)
    env.add_python_archive(archive_file_path, "data")

    nodes = eval(env.get_execution_plan())['nodes']
    # The StreamGraph should be as bellow:
    # Source: From Collection -> _stream_key_by_map_operator ->
    # Plus Two Map -> Add From File Map -> Sink: Test Sink.
    # Source: From Collection and _stream_key_by_map_operator should have same parallelism.
    self.assertEqual(nodes[0]['parallelism'], nodes[1]['parallelism'])
    # The parallelism of Plus Two Map should be 3
    self.assertEqual(nodes[2]['parallelism'], 3)
    # The ship_strategy for Source: From Collection and
    # _stream_key_by_map_operator should be FORWARD
    self.assertEqual(nodes[1]['predecessors'][0]['ship_strategy'], "FORWARD")
    # The ship_strategy for _keyed_stream_values_operator and
    # Plus Two Map should be HASH
    self.assertEqual(nodes[2]['predecessors'][0]['ship_strategy'], "HASH")
    # The parallelism of Sink: Test Sink should be 4
    self.assertEqual(nodes[4]['parallelism'], 4)

    python_dependency_config = dict(
        get_gateway().jvm.org.apache.flink.python.util.
        PythonDependencyUtils.configurePythonDependencies(
            env._j_stream_execution_environment.getCachedFiles(),
            env._j_stream_execution_environment.getConfiguration()).toMap())
    # Make sure that user specified files and archives are correctly added.
    self.assertIsNotNone(
        python_dependency_config['python.internal.files-key-map'])
    self.assertIsNotNone(
        python_dependency_config['python.internal.archives-key-map'])
def test_from_collection_with_data_types(self):
    """Verify ``from_collection`` with explicit ``type_info``.

    Covers two cases: a collection of single objects (strings) and a
    collection of wide tuples exercising most supported field types. When
    type info is supplied, results are collected in Flink Row string form
    ('+I[...]').
    """
    # verify from_collection for the collection with single object.
    ds = self.env.from_collection(['Hi', 'Hello'], type_info=Types.STRING())
    ds.add_sink(self.test_sink)
    self.env.execute("test from collection with single object")
    results = self.test_sink.get_results(False)
    expected = ['Hello', 'Hi']
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)

    # verify from_collection for the collection with multiple objects like
    # tuple. NOTE: 43878 exceeds SHORT range and 9147483648 exceeds INT
    # range on purpose — the expected output below (-21658, 557549056)
    # pins the 16-/32-bit wrap-around behavior of the conversion.
    ds = self.env.from_collection(
        [(1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
          bytearray(b'flink'), 'pyflink', datetime.date(2014, 9, 13),
          datetime.time(hour=12, minute=0, second=0, microsecond=123000),
          datetime.datetime(2018, 3, 11, 3, 0, 0, 123000), [1, 2, 3],
          decimal.Decimal('1000000000000000000.05'),
          decimal.Decimal('1000000000000000000.0599999999999'
                          '9999899999999999')),
         (2, None, 2, True, 43878, 9147483648, 9.87, 2.98936,
          bytearray(b'flink'), 'pyflink', datetime.date(2015, 10, 14),
          datetime.time(hour=11, minute=2, second=2, microsecond=234500),
          datetime.datetime(2020, 4, 15, 8, 2, 6, 235000), [2, 4, 6],
          decimal.Decimal('2000000000000000000.74'),
          decimal.Decimal('2000000000000000000.061111111111111'
                          '11111111111111'))],
        type_info=Types.ROW([
            Types.LONG(), Types.LONG(), Types.SHORT(), Types.BOOLEAN(),
            Types.SHORT(), Types.INT(), Types.FLOAT(), Types.DOUBLE(),
            Types.PICKLED_BYTE_ARRAY(), Types.STRING(), Types.SQL_DATE(),
            Types.SQL_TIME(), Types.SQL_TIMESTAMP(),
            Types.BASIC_ARRAY(Types.LONG()), Types.BIG_DEC(),
            Types.BIG_DEC()
        ]))
    ds.add_sink(self.test_sink)
    self.env.execute("test from collection with tuple object")
    results = self.test_sink.get_results(False)
    # if user specifies data types of input data, the collected result
    # should be in row format.
    expected = [
        '+I[1, null, 1, true, 32767, -2147483648, 1.23, 1.98932, '
        '[102, 108, 105, 110, 107], '
        'pyflink, 2014-09-13, 12:00:00, 2018-03-11 03:00:00.123, [1, 2, 3], '
        '1000000000000000000.05, '
        '1000000000000000000.05999999999999999899999999999]',
        '+I[2, null, 2, true, -21658, 557549056, 9.87, 2.98936, '
        '[102, 108, 105, 110, 107], '
        'pyflink, 2015-10-14, 11:02:02, 2020-04-15 08:02:06.235, [2, 4, 6], '
        '2000000000000000000.74, '
        '2000000000000000000.06111111111111111111111111111]'
    ]
    results.sort()
    expected.sort()
    self.assertEqual(expected, results)
def demo01():
    """Tour of the PyFlink ``StreamExecutionEnvironment`` API: dependency
    management, sources, checkpointing and common configuration knobs.

    Illustrative sample code — not intended to run end-to-end as written.
    """
    # Create an execution environment representing the context in which the
    # program currently runs: a local environment when started standalone, a
    # cluster environment when started on a cluster.
    env = StreamExecutionEnvironment.get_execution_environment()
    # Add a list of URLs appended to the classpath of every user-code class
    # loader. Paths must carry a protocol (e.g. file://) and be accessible on
    # all nodes.
    env.add_classpaths("file://lib")
    # Add a list of jar files uploaded to the cluster and referenced by the
    # job. Config-based alternative:
    # .set_string("pipeline.jars", 'file://' + dir_kafka_sql_connect)
    env.add_jars("file://jars")
    # Add a Python archive; it is extracted into the working directory of the
    # Python UDF worker. Only zip-format archives are supported (zip, jar,
    # whl, egg, ...), e.g. built via: zip -r py_env.zip py_env
    env.add_python_archive("py_env.zip")
    # If the Python UDFs depend on an interpreter version not available on
    # the cluster, upload a virtual environment and point at its interpreter
    # via this method.
    env.set_python_executable("py_env.zip/py_env/bin/python")
    # conf/flink-conf.yaml alternative: python.client.executable: /usr/bin/python3
    # or
    env.add_python_archive("py_env.zip", "myenv")
    env.set_python_executable("myenv/py_env/bin/python")
    # the files contained in the archive file can be accessed in UDF
    """
    def my_udf():
        with open("myenv/py_env/data/data.txt") as f:
            ...
    """
    # Equivalent to:
    # pip download -d cached_dir -r requirements.txt --no-binary :all:
    env.set_python_requirements("requirements.txt", "cached_dir")
    # Add a Python dependency (file, package or local directory); it is added
    # to the PYTHONPATH of the Python UDF workers — make sure it is importable.
    env.add_python_file("")

    # Add a source.
    # 1. add_source
    ds = env.add_source(
        FlinkKafkaConsumer(
            "source_topic",
            JsonRowDeserializationSchema.builder().type_info(
                type_info=Types.ROW([Types.INT(), Types.STRING()])).build(),
            {
                'bootstrap.servers': 'localhost:9092',
                'group.id': 'test_group'
            }))
    # 2. from_collection
    ds = env.from_collection([
        1,
        2,
        3,
    ], Types.INT())
    # 3. from a file
    ds = env.read_text_file("hdfs://host:port/file/path")

    # Disable operator chaining.
    env.disable_operator_chaining()

    """
    Flink performs stateful stream computation efficiently through the
    built-in Keyed State and Operator State, which persist each operator's
    state. By default state lives in JVM heap memory; if a node crashes, all
    state is lost and cannot be recovered, corrupting the computation.
    Checkpointing provides the fault tolerance: it periodically persists
    operator state so that, on recovery, every operator's state is restored
    from the checkpoint and consumption resumes from the last position —
    efficient computation without data loss.

    AT_LEAST_ONCE: if a timeout triggers a retransmission but the server had
    in fact already received the data (only the ack was lost), the data is
    received more than once — hence "at least once".
    EXACTLY_ONCE: strictly once.
    The three guarantees: at-most-once, at-least-once, exactly-once.

    Two prerequisites for checkpointing:
    1. A source that can replay data within a time range, e.g. Kafka:
       recovery restarts from the last successful checkpoint and must
       re-consume the data processed since then. A non-replayable source
       would lose any data not yet written to storage.
    2. A store for the persisted state, e.g. HDFS or local files, from which
       checkpoint data is restored after a failure.
    https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/stream/state/checkpointing.html
    https://ci.apache.org/projects/flink/flink-docs-release-1.12/api/python/pyflink.datastream.html#pyflink.datastream.CheckpointConfig
    """
    # Take a checkpoint every 300 s.
    env.enable_checkpointing(300000, CheckpointingMode.AT_LEAST_ONCE)
    # MemoryStateBackend / FsStateBackend / CustomStateBackend
    env.set_state_backend(RocksDBStateBackend("file://var/checkpoints/"))
    # set mode to exactly-once (this is the default)
    env.get_checkpoint_config().set_checkpointing_mode(
        CheckpointingMode.EXACTLY_ONCE)
    # Make sure 500 ms of progress happen between checkpoints (default is 0:
    # start the next checkpoint immediately).
    env.get_checkpoint_config().set_min_pause_between_checkpoints(500)
    # Checkpoints have to complete within one minute, or are discarded.
    env.get_checkpoint_config().set_checkpoint_timeout(60000)
    # Allow only one checkpoint to be in progress at the same time.
    env.get_checkpoint_config().set_max_concurrent_checkpoints(1)
    # Enable externalized checkpoints which are retained after job
    # cancellation.
    env.get_checkpoint_config().enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    # Allow job recovery to fall back to a checkpoint even when a more recent
    # savepoint exists.
    env.get_checkpoint_config().set_prefer_checkpoint_for_recovery(True)
    # Enable the experimental unaligned checkpoints for better performance;
    # only usable with CheckpointingMode.EXACTLY_ONCE.
    env.get_checkpoint_config().enable_unaligned_checkpoints()
    # env.get_checkpoint_config().disable_unaligned_checkpoints() is the same
    # as env.get_checkpoint_config().enable_unaligned_checkpoints(False)
    env.get_checkpoint_interval(
    )  # same as env.get_checkpoint_config().get_checkpoint_interval()
    """ """
    # https://ci.apache.org/projects/flink/flink-docs-release-1.12/api/python/pyflink.common.html#pyflink.common.ExecutionConfig
    # bin/flink run -Dexecution.runtime-mode=BATCH examples/streaming/WordCount.jar
    env.get_config().set_execution_mode(ExecutionMode.BATCH)
    env.get_config().disable_auto_generated_uids(
    )  # enable_auto_generated_uids; set uids manually via ds.uid("xx")
    # Set the time characteristic for all streams created from this
    # environment: processing time, event time or ingestion time. With
    # event/ingestion time the default watermark interval is 200 ms.
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env.get_config().set_auto_watermark_interval(200)  # watermark every 200 ms
    env.get_config().set_global_job_parameters(
        {"environment.checkpoint_interval": "1000"})
    env.get_config().set_restart_strategy(
        RestartStrategies.fixed_delay_restart(10, 1000))
    # Execute.
    env.execute("job name")
    # Asynchronous execution.
    jobClient = env.execute_async("job name")
    jobClient.get_job_execution_result().result()
    """
    Maximum frequency (ms) at which the output buffers are flushed. By
    default buffers flush frequently for low latency and a smooth developer
    experience. Three logical modes:
    positive integer — periodic flush at that interval;
    0 — flush after every record, minimizing latency (prefer a small value
    such as 5 or 10 over exactly 0);
    -1 — flush only when the output buffer is full, maximizing throughput.
    """
    # Maximum output-buffer flush interval in ms.
    env.get_buffer_timeout()
    env.set_buffer_timeout(10)
    # JSON execution plan; paste it into https://flink.apache.org/visualizer/
    env.get_execution_plan()
def test_pulsar_sink(self):
    """Build a PulsarSink through its builder, attach it to a stream, and
    confirm every builder setting is reflected in the underlying Java sink
    (service/admin URLs, producer name, serialization schema, delivery
    guarantee, topic router, message delayer and raw config options)."""
    stream = self.env.from_collection(
        [('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
        type_info=Types.ROW([Types.STRING(), Types.INT()]))

    TEST_OPTION_NAME = 'pulsar.producer.chunkingEnabled'
    pulsar_sink = PulsarSink.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_producer_name('fo') \
        .set_topics('ada') \
        .set_serialization_schema(
            PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
        .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
        .set_topic_routing_mode(TopicRoutingMode.ROUND_ROBIN) \
        .delay_sending_message(MessageDelayer.fixed(Duration.of_seconds(12))) \
        .set_config(TEST_OPTION_NAME, True) \
        .set_properties({'pulsar.producer.batchingMaxMessages': '100'}) \
        .build()
    stream.sink_to(pulsar_sink).name('pulsar sink')

    # The writer operator must show up in the execution plan.
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('pulsar sink: Writer', plan['nodes'][1]['type'])

    j_sink = pulsar_sink.get_java_function()
    sink_conf = get_field_value(j_sink, "sinkConfiguration")

    def _j_string_option(key):
        # Java ConfigOption handle for a string-typed Pulsar option.
        return ConfigOptions.key(key).string_type() \
            .no_default_value()._j_config_option

    self.assertEqual(
        sink_conf.getString(_j_string_option('pulsar.client.serviceUrl')),
        'pulsar://localhost:6650')
    self.assertEqual(
        sink_conf.getString(_j_string_option('pulsar.admin.adminUrl')),
        'http://localhost:8080')
    # The builder appends a "- %s" suffix placeholder to the producer name.
    self.assertEqual(
        sink_conf.getString(_j_string_option('pulsar.producer.producerName')),
        'fo - %s')

    # The flink-schema wrapper must delegate to SimpleStringSchema.
    j_pulsar_serialization_schema = get_field_value(
        j_sink, 'serializationSchema')
    j_serialization_schema = get_field_value(
        j_pulsar_serialization_schema, 'serializationSchema')
    self.assertTrue(
        is_instance_of(
            j_serialization_schema,
            'org.apache.flink.api.common.serialization.SimpleStringSchema'))

    self.assertEqual(
        sink_conf.getString(
            _j_string_option('pulsar.sink.deliveryGuarantee')),
        'at-least-once')

    j_topic_router = get_field_value(j_sink, "topicRouter")
    self.assertTrue(
        is_instance_of(
            j_topic_router,
            'org.apache.flink.connector.pulsar.sink.writer.router.RoundRobinTopicRouter'
        ))

    # 12 seconds of fixed delay is stored as 12000 ms.
    j_message_delayer = get_field_value(j_sink, 'messageDelayer')
    self.assertEqual(
        get_field_value(j_message_delayer, 'delayDuration'), 12000)

    chunking_option = ConfigOptions.key(
        TEST_OPTION_NAME).boolean_type().no_default_value()
    self.assertEqual(
        sink_conf.getBoolean(chunking_option._j_config_option), True)
    self.assertEqual(
        sink_conf.getLong(
            ConfigOptions.key('pulsar.producer.batchingMaxMessages').
            long_type().no_default_value()._j_config_option), 100)
def test_es_sink(self):
    """Configure an Elasticsearch 7 sink via its builder and verify that the
    emitter, hosts, delivery guarantee, bulk-processor settings and network
    client settings all land on the underlying Java sink."""
    stream = self.env.from_collection(
        [{'name': 'ada', 'id': '1'},
         {'name': 'luna', 'id': '2'}],
        type_info=Types.MAP(Types.STRING(), Types.STRING()))

    es_sink = Elasticsearch7SinkBuilder() \
        .set_emitter(ElasticsearchEmitter.static_index('foo', 'id')) \
        .set_hosts(['localhost:9200']) \
        .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
        .set_bulk_flush_max_actions(1) \
        .set_bulk_flush_max_size_mb(2) \
        .set_bulk_flush_interval(1000) \
        .set_bulk_flush_backoff_strategy(FlushBackoffType.CONSTANT, 3, 3000) \
        .set_connection_username('foo') \
        .set_connection_password('bar') \
        .set_connection_path_prefix('foo-bar') \
        .set_connection_request_timeout(30000) \
        .set_connection_timeout(31000) \
        .set_socket_timeout(32000) \
        .build()

    j_sink = es_sink.get_java_function()

    # static_index('foo', 'id') is implemented by SimpleElasticsearchEmitter.
    j_emitter = get_field_value(j_sink, 'emitter')
    self.assertTrue(
        is_instance_of(
            j_emitter,
            'org.apache.flink.connector.elasticsearch.sink.SimpleElasticsearchEmitter'
        ))
    # Hosts are normalized to full HTTP URLs.
    self.assertEqual(
        get_field_value(j_sink, 'hosts')[0].toString(),
        'http://localhost:9200')
    self.assertEqual(
        get_field_value(j_sink, 'deliveryGuarantee').toString(),
        'at-least-once')

    # Bulk-processor settings.
    j_bulk_config = get_field_value(j_sink, 'buildBulkProcessorConfig')
    self.assertEqual(j_bulk_config.getBulkFlushMaxActions(), 1)
    self.assertEqual(j_bulk_config.getBulkFlushMaxMb(), 2)
    self.assertEqual(j_bulk_config.getBulkFlushInterval(), 1000)
    self.assertEqual(j_bulk_config.getFlushBackoffType().toString(),
                     'CONSTANT')
    self.assertEqual(j_bulk_config.getBulkFlushBackoffRetries(), 3)
    self.assertEqual(j_bulk_config.getBulkFlushBackOffDelay(), 3000)

    # Network client settings.
    j_net_config = get_field_value(j_sink, 'networkClientConfig')
    self.assertEqual(j_net_config.getUsername(), 'foo')
    self.assertEqual(j_net_config.getPassword(), 'bar')
    self.assertEqual(j_net_config.getConnectionRequestTimeout(), 30000)
    self.assertEqual(j_net_config.getConnectionTimeout(), 31000)
    self.assertEqual(j_net_config.getSocketTimeout(), 32000)
    self.assertEqual(j_net_config.getConnectionPathPrefix(), 'foo-bar')

    stream.sink_to(es_sink).name('es sink')
import json from pyflink.common.serialization import SimpleStringSchema, SimpleStringEncoder, JsonRowDeserializationSchema from pyflink.datastream import StreamExecutionEnvironment from pyflink.datastream.connectors import FlinkKafkaConsumer, StreamingFileSink from pyflink.common.typeinfo import Types from pyflink.datastream.functions import MapFunction s_env = StreamExecutionEnvironment.get_execution_environment() s_env.set_parallelism(1) ti = Types.ROW_NAMED( ["app", 'busi', 'date', 'ip'], [Types.STRING(), Types.STRING(), Types.BIG_INT(), Types.STRING()]) builder = JsonRowDeserializationSchema.builder() builder.type_info(ti) jrds = builder.ignore_parse_errors().build() fkc = FlinkKafkaConsumer(topics="ULS-BUSI-LOG-dev", deserialization_schema=jrds, properties={ "bootstrap.servers": "10.100.1.16:9192", "group.id": "123", "auto.offset.reset": "earliest" }) fkc.set_start_from_earliest() src = s_env.add_source(fkc).map(lambda x: x.get("values")) src.add_sink( StreamingFileSink.for_row_format('C:\\tmp\\pyoutput', SimpleStringEncoder()).build())
def to_java_typeinfo(type_info: TypeInformation):
    """Translate a Python ``TypeInformation`` into the corresponding Java
    type information object.

    Basic numeric types are widened to their Python-compatible Java forms
    (BYTE/SHORT/INT -> LONG, FLOAT -> DOUBLE, CHAR -> STRING); arrays and
    maps are translated element-wise; anything unrecognized falls back to
    the pickled-byte-array type.

    :param type_info: the Python-side type information to convert.
    :return: the Java-side type information object.
    :raises TypeError: if a basic type or array element type is unsupported.
    """
    if isinstance(type_info, BasicTypeInfo):
        basic_type = type_info._basic_type
        # Pairs are probed with == to keep the original comparison
        # semantics; first match wins.
        basic_pairs = (
            (BasicType.STRING, JTypes.STRING),
            (BasicType.BYTE, JTypes.LONG),
            (BasicType.BOOLEAN, JTypes.BOOLEAN),
            (BasicType.SHORT, JTypes.LONG),
            (BasicType.INT, JTypes.LONG),
            (BasicType.LONG, JTypes.LONG),
            (BasicType.FLOAT, JTypes.DOUBLE),
            (BasicType.DOUBLE, JTypes.DOUBLE),
            (BasicType.CHAR, JTypes.STRING),
            (BasicType.BIG_INT, JTypes.BIG_INT),
            (BasicType.BIG_DEC, JTypes.BIG_DEC),
            (BasicType.INSTANT, JTypes.INSTANT),
        )
        for py_basic, j_type in basic_pairs:
            if basic_type == py_basic:
                return j_type
        raise TypeError("Invalid BasicType %s." % basic_type)

    if isinstance(type_info, PrimitiveArrayTypeInfo):
        element_type = type_info._element_type
        primitive_pairs = (
            (Types.BOOLEAN(),
             JPrimitiveArrayTypeInfo.BOOLEAN_PRIMITIVE_ARRAY_TYPE_INFO),
            (Types.BYTE(),
             JPrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO),
            (Types.SHORT(),
             JPrimitiveArrayTypeInfo.SHORT_PRIMITIVE_ARRAY_TYPE_INFO),
            (Types.INT(),
             JPrimitiveArrayTypeInfo.INT_PRIMITIVE_ARRAY_TYPE_INFO),
            (Types.LONG(),
             JPrimitiveArrayTypeInfo.LONG_PRIMITIVE_ARRAY_TYPE_INFO),
            (Types.FLOAT(),
             JPrimitiveArrayTypeInfo.FLOAT_PRIMITIVE_ARRAY_TYPE_INFO),
            (Types.DOUBLE(),
             JPrimitiveArrayTypeInfo.DOUBLE_PRIMITIVE_ARRAY_TYPE_INFO),
            (Types.CHAR(),
             JPrimitiveArrayTypeInfo.CHAR_PRIMITIVE_ARRAY_TYPE_INFO),
        )
        for py_elem, j_type in primitive_pairs:
            if element_type == py_elem:
                return j_type
        raise TypeError("Invalid element type for a primitive array.")

    if isinstance(type_info, BasicArrayTypeInfo):
        element_type = type_info._element_type
        basic_array_pairs = (
            (Types.BOOLEAN(), JBasicArrayTypeInfo.BOOLEAN_ARRAY_TYPE_INFO),
            (Types.BYTE(), JBasicArrayTypeInfo.BYTE_ARRAY_TYPE_INFO),
            (Types.SHORT(), JBasicArrayTypeInfo.SHORT_ARRAY_TYPE_INFO),
            (Types.INT(), JBasicArrayTypeInfo.INT_ARRAY_TYPE_INFO),
            (Types.LONG(), JBasicArrayTypeInfo.LONG_ARRAY_TYPE_INFO),
            (Types.FLOAT(), JBasicArrayTypeInfo.FLOAT_ARRAY_TYPE_INFO),
            (Types.DOUBLE(), JBasicArrayTypeInfo.DOUBLE_ARRAY_TYPE_INFO),
            (Types.CHAR(), JBasicArrayTypeInfo.CHAR_ARRAY_TYPE_INFO),
            (Types.STRING(), JBasicArrayTypeInfo.STRING_ARRAY_TYPE_INFO),
        )
        for py_elem, j_type in basic_array_pairs:
            if element_type == py_elem:
                return j_type
        raise TypeError("Invalid element type for a basic array.")

    if isinstance(type_info, ObjectArrayTypeInfo):
        # Recurse on the element type.
        return JTypes.OBJECT_ARRAY(to_java_typeinfo(type_info._element_type))

    if isinstance(type_info, MapTypeInfo):
        # Recurse on key and value types.
        return JMapTypeInfo(to_java_typeinfo(type_info._key_type_info),
                            to_java_typeinfo(type_info._value_type_info))

    # Fallback: ship the value as a pickled byte array.
    return JPickledByteArrayTypeInfo.PICKLED_BYTE_ARRAY_TYPE_INFO