def word_count(input_path, output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    if input_path is not None:
        ds = env.from_source(
            source=FileSource.for_record_stream_format(StreamFormat.text_line_format(),
                                                       input_path)
                             .process_static_file_set().build(),
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="file_source"
        )
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        ds = env.from_collection(word_count_data)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
           .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
           .key_by(lambda i: i[0]) \
           .reduce(lambda i, j: (i[0], i[1] + j[1]))

    # define the sink
    if output_path is not None:
        ds.sink_to(
            sink=FileSink.for_row_format(
                base_path=output_path,
                encoder=Encoder.simple_string_encoder())
            .with_output_file_config(
                OutputFileConfig.builder()
                .with_part_prefix("prefix")
                .with_part_suffix(".ext")
                .build())
            .with_rolling_policy(RollingPolicy.default_rolling_policy())
            .build()
        )
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        ds.print()

    # submit for execution
    env.execute()
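# A minimal command-line driver for word_count() above. This is a sketch, not
# the original example's entry point; the --input/--output flag names simply
# mirror the usage hints the function prints.
if __name__ == '__main__':
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=False,
                        help='Output file to write results to.')
    known_args, _ = parser.parse_known_args(sys.argv[1:])
    word_count(known_args.input, known_args.output)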
def write_to_kafka(env):
    ds = env.from_collection([(1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'),
                              (5, 'hi'), (6, 'hello'), (6, 'hello')],
                             type_info=Types.ROW([Types.INT(), Types.STRING()]))

    serialization_schema = AvroRowSerializationSchema(
        avro_schema_string="""
            {
                "type": "record",
                "name": "TestRecord",
                "fields": [
                    {"name": "id", "type": "int"},
                    {"name": "name", "type": "string"}
                ]
            }"""
    )

    kafka_producer = FlinkKafkaProducer(
        topic='test_avro_topic',
        serialization_schema=serialization_schema,
        producer_config={
            'bootstrap.servers': 'localhost:9092',
            'group.id': 'test_group'
        })

    # note that the output type of ds must be RowTypeInfo
    ds.add_sink(kafka_producer)
    env.execute()
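# write_to_kafka() above needs the Kafka connector and the Avro format classes
# on the classpath. A sketch of the setup; the jar paths below are
# placeholders, not the exact artifacts this snippet was written against, and
# must match the Flink version in use.
def avro_kafka_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///path/to/flink-sql-connector-kafka.jar",
                 "file:///path/to/flink-sql-avro.jar")
    write_to_kafka(env)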
def test_cassandra_sink(self):
    type_info = Types.ROW([Types.STRING(), Types.INT()])
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=type_info)

    cassandra_sink_builder = CassandraSink.add_sink(ds)
    cassandra_sink = cassandra_sink_builder \
        .set_host('localhost', 9876) \
        .set_query('query') \
        .enable_ignore_null_fields() \
        .set_mapper_options(MapperOptions()
                            .ttl(1)
                            .timestamp(100)
                            .tracing(True)
                            .if_not_exists(False)
                            .consistency_level(ConsistencyLevel.ANY)
                            .save_null_fields(True)) \
        .set_max_concurrent_requests(1000) \
        .build()

    cassandra_sink.name('cassandra_sink').set_parallelism(3)

    plan = eval(self.env.get_execution_plan())
    self.assertEqual("Sink: cassandra_sink", plan['nodes'][1]['type'])
    self.assertEqual(3, plan['nodes'][1]['parallelism'])
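# In the test above, set_query('query') is only a placeholder; the plan test
# never talks to a cluster. Against a real Cassandra instance the query would
# be a parameterized CQL INSERT matching the (STRING, INT) row type. A sketch,
# with keyspace, table, and column names as placeholders:
def cassandra_sink_example(ds):
    # ds is a DataStream of (STRING, INT) rows, as in the test above
    CassandraSink.add_sink(ds) \
        .set_host('localhost') \
        .set_query('INSERT INTO example_keyspace.example_table (word, count) VALUES (?, ?);') \
        .build()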
def test_csv_row_serialization_schema(self):
    jvm = get_gateway().jvm
    JRow = jvm.org.apache.flink.types.Row

    j_row = JRow(3)
    j_row.setField(0, "BEGIN")
    j_row.setField(2, "END")

    def field_assertion(field_info, csv_value, value, field_delimiter):
        row_info = Types.ROW([Types.STRING(), field_info, Types.STRING()])
        expected_csv = "BEGIN" + field_delimiter + csv_value + field_delimiter + "END\n"
        j_row.setField(1, value)

        csv_row_serialization_schema = CsvRowSerializationSchema.Builder(row_info) \
            .set_escape_character('*').set_quote_character('\'') \
            .set_array_element_delimiter(':').set_field_delimiter(';').build()
        csv_row_deserialization_schema = CsvRowDeserializationSchema.Builder(row_info) \
            .set_escape_character('*').set_quote_character('\'') \
            .set_array_element_delimiter(':').set_field_delimiter(';').build()
        csv_row_serialization_schema._j_serialization_schema.open(
            jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext())
        csv_row_deserialization_schema._j_deserialization_schema.open(
            jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext())

        serialized_bytes = csv_row_serialization_schema._j_serialization_schema.serialize(j_row)
        self.assertEqual(expected_csv, str(serialized_bytes, encoding='utf-8'))

        j_deserialized_row = csv_row_deserialization_schema._j_deserialization_schema \
            .deserialize(expected_csv.encode("utf-8"))
        self.assertTrue(j_row.equals(j_deserialized_row))

    field_assertion(Types.STRING(), "'123''4**'", "123'4*", ";")
    field_assertion(Types.STRING(), "'a;b''c'", "a;b'c", ";")
    field_assertion(Types.INT(), "12", 12, ";")

    test_j_row = JRow(2)
    test_j_row.setField(0, "1")
    test_j_row.setField(1, "hello")
    field_assertion(Types.ROW([Types.STRING(), Types.STRING()]), "'1:hello'", test_j_row, ";")
    test_j_row.setField(1, "hello world")
    field_assertion(Types.ROW([Types.STRING(), Types.STRING()]), "'1:hello world'",
                    test_j_row, ";")
    field_assertion(Types.STRING(), "null", "null", ";")
def read_from_kafka(env):
    deserialization_schema = JsonRowDeserializationSchema.Builder() \
        .type_info(Types.ROW([Types.INT(), Types.STRING()])) \
        .build()
    kafka_consumer = FlinkKafkaConsumer(
        topics='test_csv_topic',
        deserialization_schema=deserialization_schema,
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group_1'}
    )
    kafka_consumer.set_start_from_earliest()

    env.add_source(kafka_consumer).print()
    env.execute()
def write_to_kafka(env):
    type_info = Types.ROW([Types.INT(), Types.STRING()])
    ds = env.from_collection([
        (1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'),
        (5, 'hi'), (6, 'hello'), (6, 'hello')],
        type_info=type_info)

    serialization_schema = CsvRowSerializationSchema.Builder(type_info).build()
    kafka_producer = FlinkKafkaProducer(
        topic='test_csv_topic',
        serialization_schema=serialization_schema,
        producer_config={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    )

    # note that the output type of ds must be RowTypeInfo
    ds.add_sink(kafka_producer)
    env.execute()
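# A sketch of a driver for the two Kafka helpers above, assuming the Kafka
# connector jar has been downloaded locally (the path is a placeholder).
# Note that read_from_kafka() pairs a JSON deserializer with 'test_csv_topic',
# which write_to_kafka() fills with CSV; the two snippets were likely written
# against different topics, so reading back what was just written would fail
# to parse as-is.
if __name__ == '__main__':
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///path/to/flink-sql-connector-kafka.jar")
    print("start writing data to kafka")
    write_to_kafka(env)
    print("start reading data from kafka")
    read_from_kafka(env)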
def test_jdbc_sink(self):
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))
    jdbc_connection_options = JdbcConnectionOptions.JdbcConnectionOptionsBuilder() \
        .with_driver_name('com.mysql.jdbc.Driver') \
        .with_user_name('root') \
        .with_password('password') \
        .with_url('jdbc:mysql://server-name:server-port/database-name').build()

    jdbc_execution_options = JdbcExecutionOptions.builder() \
        .with_batch_interval_ms(2000) \
        .with_batch_size(100) \
        .with_max_retries(5).build()
    jdbc_sink = JdbcSink.sink("insert into test table", ds.get_type(),
                              jdbc_connection_options, jdbc_execution_options)

    ds.add_sink(jdbc_sink).name('jdbc sink')
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])

    j_output_format = get_field_value(jdbc_sink.get_java_function(), 'outputFormat')

    connection_options = JdbcConnectionOptions(
        get_field_value(get_field_value(j_output_format, 'connectionProvider'),
                        'jdbcOptions'))
    self.assertEqual(jdbc_connection_options.get_db_url(),
                     connection_options.get_db_url())
    self.assertEqual(jdbc_connection_options.get_driver_name(),
                     connection_options.get_driver_name())
    self.assertEqual(jdbc_connection_options.get_password(),
                     connection_options.get_password())
    self.assertEqual(jdbc_connection_options.get_user_name(),
                     connection_options.get_user_name())

    exec_options = JdbcExecutionOptions(get_field_value(j_output_format, 'executionOptions'))
    self.assertEqual(jdbc_execution_options.get_batch_interval_ms(),
                     exec_options.get_batch_interval_ms())
    self.assertEqual(jdbc_execution_options.get_batch_size(),
                     exec_options.get_batch_size())
    self.assertEqual(jdbc_execution_options.get_max_retries(),
                     exec_options.get_max_retries())
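# "insert into test table" in the test above is a placeholder that is never
# executed. A sketch of a realistic statement for the same (STRING, INT) rows;
# the table and column names are assumptions:
def jdbc_sink_example(ds, connection_options, execution_options):
    ds.add_sink(JdbcSink.sink(
        "insert into word_count (word, cnt) values (?, ?)",
        ds.get_type(),
        connection_options,
        execution_options))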
def basic_operations():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # define the source
    ds = env.from_collection(
        collection=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        type_info=Types.ROW_NAMED(["id", "info"], [Types.INT(), Types.STRING()]))

    # map
    def update_tel(data):
        # parse the json
        json_data = json.loads(data.info)
        json_data['tel'] += 1
        return data.id, json.dumps(json_data)

    show(ds.map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')
    # (2, '{"name": "hello", "tel": 136, "addr": {"country": "China", "city": "Shanghai"}}')
    # (3, '{"name": "world", "tel": 125, "addr": {"country": "USA", "city": "NewYork"}}')
    # (4, '{"name": "PyFlink", "tel": 33, "addr": {"country": "China", "city": "Hangzhou"}}')

    # filter
    show(ds.filter(lambda data: data.id == 1).map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')

    # key by
    show(ds.map(lambda data: (json.loads(data.info)['addr']['country'],
                              json.loads(data.info)['tel']))
           .key_by(lambda data: data[0]).sum(1), env)
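# basic_operations() calls a show() helper that is not defined in this
# snippet. A minimal sketch of what it presumably does: print the stream and
# run the job, so the commented results above appear on stdout.
def show(ds, env):
    ds.print()
    env.execute()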
def test_json_row_serialization_deserialization_schema(self):
    jvm = get_gateway().jvm
    jsons = [
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\", "
        "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},\"ids\":[1, 2, 3]}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\"}"
    ]
    expected_jsons = [
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\","
        "\"ops\":{\"id\":\"281708d0-4092-4c21-9233-931950b6eccf\"},"
        "\"ids\":[1,2,3]}",
        "{\"svt\":\"2020-02-24T12:58:09.209+0800\",\"ops\":null,\"ids\":null}"
    ]

    row_schema = Types.ROW_NAMED(["svt", "ops", "ids"],
                                 [Types.STRING(),
                                  Types.ROW_NAMED(['id'], [Types.STRING()]),
                                  Types.PRIMITIVE_ARRAY(Types.INT())])

    json_row_serialization_schema = JsonRowSerializationSchema.builder() \
        .with_type_info(row_schema).build()
    json_row_deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(row_schema).build()
    json_row_serialization_schema._j_serialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext())
    json_row_deserialization_schema._j_deserialization_schema.open(
        jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext())

    for i in range(len(jsons)):
        j_row = json_row_deserialization_schema._j_deserialization_schema \
            .deserialize(bytes(jsons[i], encoding='utf-8'))
        result = str(json_row_serialization_schema._j_serialization_schema
                     .serialize(j_row), encoding='utf-8')
        self.assertEqual(expected_jsons[i], result)
def setUp(self):
    super(VectorAssemblerTest, self).setUp()
    self.input_data_table = self.t_env.from_data_stream(
        self.env.from_collection([
            (0, Vectors.dense(2.1, 3.1), 1.0, Vectors.sparse(5, [3], [1.0])),
            (1, Vectors.dense(2.1, 3.1), 1.0,
             Vectors.sparse(5, [1, 2, 3, 4], [1.0, 2.0, 3.0, 4.0])),
            (2, None, None, None),
        ],
            type_info=Types.ROW_NAMED(
                ['id', 'vec', 'num', 'sparse_vec'],
                [Types.INT(), DenseVectorTypeInfo(), Types.DOUBLE(),
                 SparseVectorTypeInfo()])))

    self.expected_output_data_1 = Vectors.sparse(8, [0, 1, 2, 6], [2.1, 3.1, 1.0, 1.0])
    self.expected_output_data_2 = Vectors.dense(2.1, 3.1, 1.0, 0.0, 1.0, 2.0, 3.0, 4.0)
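# The expected vectors above assume 'vec' (size 2), 'num' (size 1) and
# 'sparse_vec' (size 5) are concatenated into one size-8 vector, with rows
# containing nulls kept. A sketch of the transform this setUp presumably
# feeds; the parameter values are assumptions, not taken from the test body:
def assemble(input_data_table):
    vector_assembler = VectorAssembler() \
        .set_input_cols('vec', 'num', 'sparse_vec') \
        .set_output_col('assembled_vec') \
        .set_input_sizes(2, 1, 5) \
        .set_handle_invalid('keep')
    return vector_assembler.transform(input_data_table)[0]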
def test_rabbitmq_connectors(self):
    connection_config = RMQConnectionConfig.Builder() \
        .set_host('localhost') \
        .set_port(5672) \
        .set_virtual_host('/') \
        .set_user_name('guest') \
        .set_password('guest') \
        .build()
    type_info = Types.ROW([Types.INT(), Types.STRING()])
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    rmq_source = RMQSource(connection_config, 'source_queue', True, deserialization_schema)
    self.assertEqual(get_field_value(rmq_source.get_java_function(), 'queueName'),
                     'source_queue')
    self.assertTrue(get_field_value(rmq_source.get_java_function(), 'usesCorrelationId'))

    serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
        .build()
    rmq_sink = RMQSink(connection_config, 'sink_queue', serialization_schema)
    self.assertEqual(get_field_value(rmq_sink.get_java_function(), 'queueName'),
                     'sink_queue')
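# A sketch of wiring the same source and sink into a runnable job, assuming
# the RabbitMQ connector jar is on the classpath (the path is a placeholder):
def rmq_pipeline():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///path/to/flink-sql-connector-rabbitmq.jar")
    connection_config = RMQConnectionConfig.Builder() \
        .set_host('localhost').set_port(5672).set_virtual_host('/') \
        .set_user_name('guest').set_password('guest').build()
    type_info = Types.ROW([Types.INT(), Types.STRING()])
    source = RMQSource(connection_config, 'source_queue', True,
                       JsonRowDeserializationSchema.builder()
                       .type_info(type_info=type_info).build())
    sink = RMQSink(connection_config, 'sink_queue',
                   JsonRowSerializationSchema.builder()
                   .with_type_info(type_info).build())
    env.add_source(source, source_name='rmq_source', type_info=type_info).add_sink(sink)
    env.execute()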
# reconstruct the argument parser that the truncated snippet references
parser = argparse.ArgumentParser()
parser.add_argument('--output', dest='output', required=False,
                    help='Output file to write results to.')

argv = sys.argv[1:]
known_args, _ = parser.parse_known_args(argv)
output_path = known_args.output

env = StreamExecutionEnvironment.get_execution_environment()
# write all the data to one file
env.set_parallelism(1)

# define the source
data_stream = env.from_collection([
    ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 5), ('hi', 8),
    ('hi', 9), ('hi', 15)],
    type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

# define the watermark strategy
watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
    .with_timestamp_assigner(MyTimestampAssigner())

ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
    .key_by(lambda x: x[0], key_type=Types.STRING()) \
    .window(TumblingEventTimeWindows.of(Time.milliseconds(5))) \
    .process(CountWindowProcessFunction(),
             Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()]))

# define the sink (the original snippet breaks off mid-builder; completed
# minimally following the word_count example above)
if output_path is not None:
    ds.sink_to(
        sink=FileSink.for_row_format(
            base_path=output_path,
            encoder=Encoder.simple_string_encoder())
        .build())
else:
    print("Printing result to stdout. Use --output to specify output path.")
    ds.print()

# submit for execution
env.execute()
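# The fragment above references MyTimestampAssigner and
# CountWindowProcessFunction without defining them. A sketch of plausible
# definitions, assuming the second tuple field carries the event timestamp
# and each window emits (key, window_start, window_end, count):
from typing import Iterable

from pyflink.common.watermark_strategy import TimestampAssigner
from pyflink.datastream.functions import ProcessWindowFunction
from pyflink.datastream.window import TimeWindow


class MyTimestampAssigner(TimestampAssigner):

    def extract_timestamp(self, value, record_timestamp) -> int:
        return int(value[1])


class CountWindowProcessFunction(ProcessWindowFunction[tuple, tuple, str, TimeWindow]):

    def process(self,
                key: str,
                context: ProcessWindowFunction.Context[TimeWindow],
                elements: Iterable[tuple]) -> Iterable[tuple]:
        return [(key, context.window().start, context.window().end,
                 len([e for e in elements]))]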
# reconstruct the head of the truncated argument-parser call
parser = argparse.ArgumentParser()
parser.add_argument('--output', dest='output', required=False,
                    help='Output file to write results to.')

argv = sys.argv[1:]
known_args, _ = parser.parse_known_args(argv)
output_path = known_args.output

env = StreamExecutionEnvironment.get_execution_environment()
# write all the data to one file
env.set_parallelism(1)

# define the source
data_stream = env.from_collection([
    (1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'),
    (5, 'hi'), (6, 'hello'), (6, 'hello')],
    type_info=Types.TUPLE([Types.INT(), Types.STRING()]))

ds = data_stream.key_by(lambda x: x[1], key_type=Types.STRING()) \
    .count_window(2) \
    .apply(SumWindowFunction(), Types.TUPLE([Types.STRING(), Types.INT()]))

# define the sink (the snippet breaks off mid-builder; completed following
# the word_count example above)
if output_path is not None:
    ds.sink_to(
        sink=FileSink.for_row_format(
            base_path=output_path,
            encoder=Encoder.simple_string_encoder())
        .with_output_file_config(
            OutputFileConfig.builder()
            .with_part_prefix("prefix")
            .with_part_suffix(".ext")
            .build())
        .with_rolling_policy(RollingPolicy.default_rolling_policy())
        .build())
else:
    print("Printing result to stdout. Use --output to specify output path.")
    ds.print()

# submit for execution
env.execute()
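# SumWindowFunction is used above but not defined in this snippet. A sketch
# of a plausible implementation that sums the integer field of each element
# in a two-element count window:
from typing import Iterable

from pyflink.datastream.functions import WindowFunction
from pyflink.datastream.window import CountWindow


class SumWindowFunction(WindowFunction[tuple, tuple, str, CountWindow]):

    def apply(self, key: str, window: CountWindow, inputs: Iterable[tuple]):
        result = 0
        for i in inputs:
            result += i[0]
        return [(key, result)]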
def setUp(self):
    super(BinaryClassificationEvaluatorTest, self).setUp()
    self.input_data_table = self.t_env.from_data_stream(
        self.env.from_collection([
            (1.0, Vectors.dense(0.1, 0.9)),
            (1.0, Vectors.dense(0.2, 0.8)),
            (1.0, Vectors.dense(0.3, 0.7)),
            (0.0, Vectors.dense(0.25, 0.75)),
            (0.0, Vectors.dense(0.4, 0.6)),
            (1.0, Vectors.dense(0.35, 0.65)),
            (1.0, Vectors.dense(0.45, 0.55)),
            (0.0, Vectors.dense(0.6, 0.4)),
            (0.0, Vectors.dense(0.7, 0.3)),
            (1.0, Vectors.dense(0.65, 0.35)),
            (0.0, Vectors.dense(0.8, 0.2)),
            (1.0, Vectors.dense(0.9, 0.1))
        ],
            type_info=Types.ROW_NAMED(
                ['label', 'rawPrediction'],
                [Types.DOUBLE(), DenseVectorTypeInfo()])))

    self.input_data_table_score = self.t_env.from_data_stream(
        self.env.from_collection([
            (1, 0.9), (1, 0.8), (1, 0.7), (0, 0.75), (0, 0.6), (1, 0.65),
            (1, 0.55), (0, 0.4), (0, 0.3), (1, 0.35), (0, 0.2), (1, 0.1)
        ],
            type_info=Types.ROW_NAMED(
                ['label', 'rawPrediction'],
                [Types.INT(), Types.DOUBLE()])))

    self.input_data_table_with_multi_score = self.t_env.from_data_stream(
        self.env.from_collection([
            (1.0, Vectors.dense(0.1, 0.9)),
            (1.0, Vectors.dense(0.1, 0.9)),
            (1.0, Vectors.dense(0.1, 0.9)),
            (0.0, Vectors.dense(0.25, 0.75)),
            (0.0, Vectors.dense(0.4, 0.6)),
            (1.0, Vectors.dense(0.1, 0.9)),
            (1.0, Vectors.dense(0.1, 0.9)),
            (0.0, Vectors.dense(0.6, 0.4)),
            (0.0, Vectors.dense(0.7, 0.3)),
            (1.0, Vectors.dense(0.1, 0.9)),
            (0.0, Vectors.dense(0.8, 0.2)),
            (1.0, Vectors.dense(0.9, 0.1))
        ],
            type_info=Types.ROW_NAMED(
                ['label', 'rawPrediction'],
                [Types.DOUBLE(), DenseVectorTypeInfo()])))

    self.input_data_table_with_weight = self.t_env.from_data_stream(
        self.env.from_collection([
            (1.0, Vectors.dense(0.1, 0.9), 0.8),
            (1.0, Vectors.dense(0.1, 0.9), 0.7),
            (1.0, Vectors.dense(0.1, 0.9), 0.5),
            (0.0, Vectors.dense(0.25, 0.75), 1.2),
            (0.0, Vectors.dense(0.4, 0.6), 1.3),
            (1.0, Vectors.dense(0.1, 0.9), 1.5),
            (1.0, Vectors.dense(0.1, 0.9), 1.4),
            (0.0, Vectors.dense(0.6, 0.4), 0.3),
            (0.0, Vectors.dense(0.7, 0.3), 0.5),
            (1.0, Vectors.dense(0.1, 0.9), 1.9),
            (0.0, Vectors.dense(0.8, 0.2), 1.2),
            (1.0, Vectors.dense(0.9, 0.1), 1.0)
        ],
            type_info=Types.ROW_NAMED(
                ['label', 'rawPrediction', 'weight'],
                [Types.DOUBLE(), DenseVectorTypeInfo(), Types.DOUBLE()])))

    self.expected_data = [0.7691481137909708, 0.3714285714285714, 0.6571428571428571]
    self.expected_data_m = [0.8571428571428571, 0.9377705627705628,
                            0.8571428571428571, 0.6488095238095237]
    self.expected_data_w = 0.8911680911680911
    self.eps = 1e-5
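# The expected_data values above are metric scores for the tables just built.
# A sketch of the evaluation this setUp presumably feeds; the metric names
# are assumptions, not taken from the test body:
def evaluate(input_data_table):
    evaluator = BinaryClassificationEvaluator() \
        .set_metrics_names('areaUnderPR', 'ks', 'areaUnderROC')
    return evaluator.transform(input_data_table)[0]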
# reconstruct the head of the truncated argument-parser call
parser = argparse.ArgumentParser()
parser.add_argument('--output', dest='output', required=False,
                    help='Output file to write results to.')

argv = sys.argv[1:]
known_args, _ = parser.parse_known_args(argv)
output_path = known_args.output

env = StreamExecutionEnvironment.get_execution_environment()
# write all the data to one file
env.set_parallelism(1)

# define the source
data_stream = env.from_collection([
    ('hi', 1), ('hi', 2), ('hi', 3), ('hi', 4), ('hi', 8), ('hi', 9),
    ('hi', 15)],
    type_info=Types.TUPLE([Types.STRING(), Types.INT()]))

# define the watermark strategy
watermark_strategy = WatermarkStrategy.for_monotonous_timestamps() \
    .with_timestamp_assigner(MyTimestampAssigner())

ds = data_stream.assign_timestamps_and_watermarks(watermark_strategy) \
    .key_by(lambda x: x[0], key_type=Types.STRING()) \
    .window(EventTimeSessionWindows.with_dynamic_gap(MySessionWindowTimeGapExtractor())) \
    .process(CountWindowProcessFunction(),
             Types.TUPLE([Types.STRING(), Types.INT(), Types.INT(), Types.INT()]))

# define the sink (the snippet breaks off mid-builder; completed minimally
# following the word_count example above)
if output_path is not None:
    ds.sink_to(
        sink=FileSink.for_row_format(
            base_path=output_path,
            encoder=Encoder.simple_string_encoder())
        .build())
else:
    print("Printing result to stdout. Use --output to specify output path.")
    ds.print()

# submit for execution
env.execute()
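# MySessionWindowTimeGapExtractor is referenced above but not defined here.
# A sketch of a plausible implementation, assuming the session gap in
# milliseconds is taken from the integer field of each element:
from pyflink.datastream.window import SessionWindowTimeGapExtractor


class MySessionWindowTimeGapExtractor(SessionWindowTimeGapExtractor):

    def extract(self, element) -> int:
        return element[1]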
def test_pulsar_sink(self):
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))

    TEST_OPTION_NAME = 'pulsar.producer.chunkingEnabled'
    pulsar_sink = PulsarSink.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_producer_name('fo') \
        .set_topics('ada') \
        .set_serialization_schema(
            PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
        .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
        .set_topic_routing_mode(TopicRoutingMode.ROUND_ROBIN) \
        .delay_sending_message(MessageDelayer.fixed(Duration.of_seconds(12))) \
        .set_config(TEST_OPTION_NAME, True) \
        .set_properties({'pulsar.producer.batchingMaxMessages': '100'}) \
        .build()

    ds.sink_to(pulsar_sink).name('pulsar sink')

    plan = eval(self.env.get_execution_plan())
    self.assertEqual('pulsar sink: Writer', plan['nodes'][1]['type'])

    configuration = get_field_value(pulsar_sink.get_java_function(), "sinkConfiguration")
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.client.serviceUrl')
            .string_type()
            .no_default_value()._j_config_option), 'pulsar://localhost:6650')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.admin.adminUrl')
            .string_type()
            .no_default_value()._j_config_option), 'http://localhost:8080')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.producer.producerName')
            .string_type()
            .no_default_value()._j_config_option), 'fo - %s')

    j_pulsar_serialization_schema = get_field_value(
        pulsar_sink.get_java_function(), 'serializationSchema')
    j_serialization_schema = get_field_value(
        j_pulsar_serialization_schema, 'serializationSchema')
    self.assertTrue(
        is_instance_of(
            j_serialization_schema,
            'org.apache.flink.api.common.serialization.SimpleStringSchema'))

    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.sink.deliveryGuarantee')
            .string_type()
            .no_default_value()._j_config_option), 'at-least-once')

    j_topic_router = get_field_value(pulsar_sink.get_java_function(), "topicRouter")
    self.assertTrue(
        is_instance_of(
            j_topic_router,
            'org.apache.flink.connector.pulsar.sink.writer.router.RoundRobinTopicRouter'))

    j_message_delayer = get_field_value(pulsar_sink.get_java_function(), 'messageDelayer')
    delay_duration = get_field_value(j_message_delayer, 'delayDuration')
    self.assertEqual(delay_duration, 12000)

    test_option = ConfigOptions.key(TEST_OPTION_NAME).boolean_type().no_default_value()
    self.assertEqual(
        configuration.getBoolean(test_option._j_config_option), True)
    self.assertEqual(
        configuration.getLong(
            ConfigOptions.key('pulsar.producer.batchingMaxMessages')
            .long_type()
            .no_default_value()._j_config_option), 100)
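# Outside the plan test, the same builder produces a sink for a real job. A
# sketch, assuming the Pulsar connector jar is on the classpath (the jar path
# is a placeholder) and a plain string stream matching SimpleStringSchema:
def pulsar_sink_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///path/to/flink-sql-connector-pulsar.jar")
    ds = env.from_collection(['msg-1', 'msg-2'], type_info=Types.STRING())
    pulsar_sink = PulsarSink.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_topics('ada') \
        .set_serialization_schema(
            PulsarSerializationSchema.flink_schema(SimpleStringSchema())) \
        .build()
    ds.sink_to(pulsar_sink)
    env.execute()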