def word_count(input_path, output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    if input_path is not None:
        ds = env.from_source(
            source=FileSource.for_record_stream_format(StreamFormat.text_line_format(),
                                                       input_path)
                             .process_static_file_set().build(),
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="file_source"
        )
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        ds = env.from_collection(word_count_data)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
           .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
           .key_by(lambda i: i[0]) \
           .reduce(lambda i, j: (i[0], i[1] + j[1]))

    # define the sink
    if output_path is not None:
        ds.sink_to(
            sink=FileSink.for_row_format(
                base_path=output_path,
                encoder=Encoder.simple_string_encoder())
            .with_output_file_config(
                OutputFileConfig.builder()
                .with_part_prefix("prefix")
                .with_part_suffix(".ext")
                .build())
            .with_rolling_policy(RollingPolicy.default_rolling_policy())
            .build()
        )
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        ds.print()

    # submit for execution
    env.execute()
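# word_count() above is normally launched from a small command-line driver.
# The sketch below is an assumption-laden illustration, not part of the original
# snippet: the --input/--output flag names are chosen to match the usage hints
# printed by word_count(), and word_count_data must be defined alongside it.
import argparse
import logging
import sys

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=False,
                        help='Output file to write results to.')
    known_args, _ = parser.parse_known_args(sys.argv[1:])

    word_count(known_args.input, known_args.output)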
def test_cassandra_sink(self):
    type_info = Types.ROW([Types.STRING(), Types.INT()])
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=type_info)

    cassandra_sink_builder = CassandraSink.add_sink(ds)

    cassandra_sink = cassandra_sink_builder \
        .set_host('localhost', 9876) \
        .set_query('query') \
        .enable_ignore_null_fields() \
        .set_mapper_options(MapperOptions()
                            .ttl(1)
                            .timestamp(100)
                            .tracing(True)
                            .if_not_exists(False)
                            .consistency_level(ConsistencyLevel.ANY)
                            .save_null_fields(True)) \
        .set_max_concurrent_requests(1000) \
        .build()

    cassandra_sink.name('cassandra_sink').set_parallelism(3)

    plan = eval(self.env.get_execution_plan())
    self.assertEqual("Sink: cassandra_sink", plan['nodes'][1]['type'])
    self.assertEqual(3, plan['nodes'][1]['parallelism'])
def setUp(self):
    super(OneHotEncoderTest, self).setUp()
    self.train_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (0.0,),
            (1.0,),
            (2.0,),
            (0.0,),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [Types.DOUBLE()])))

    self.predict_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (0.0,),
            (1.0,),
            (2.0,),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [Types.DOUBLE()])))

    self.expected_data = {
        0.0: Vectors.sparse(2, [0], [1.0]),
        1.0: Vectors.sparse(2, [1], [1.0]),
        2.0: Vectors.sparse(2, [], [])
    }

    self.estimator = OneHotEncoder().set_input_cols('input').set_output_cols('output')
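# A minimal sketch of how these fixtures might be exercised; the method name is
# hypothetical, and the verification mirrors the to_data_stream()/
# execute_and_collect() pattern used by the other tests in this collection.
def test_fit_and_transform(self):
    model = self.estimator.fit(self.train_data)
    output = model.transform(self.predict_data)[0]
    field_names = output.get_schema().get_field_names()
    for result in self.t_env.to_data_stream(output).execute_and_collect():
        input_value = result[field_names.index('input')]
        self.assertEqual(self.expected_data[input_value],
                         result[field_names.index('output')])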
def write_to_es7(env):
    ELASTICSEARCH_SQL_CONNECTOR_PATH = \
        'file:///path/to/flink-sql-connector-elasticsearch7-1.16.0.jar'
    env.add_jars(ELASTICSEARCH_SQL_CONNECTOR_PATH)

    ds = env.from_collection(
        [{'name': 'ada', 'id': '1'}, {'name': 'luna', 'id': '2'}],
        type_info=Types.MAP(Types.STRING(), Types.STRING()))

    es7_sink = Elasticsearch7SinkBuilder() \
        .set_emitter(ElasticsearchEmitter.static_index('foo', 'id')) \
        .set_hosts(['localhost:9200']) \
        .set_delivery_guarantee(DeliveryGuarantee.AT_LEAST_ONCE) \
        .set_bulk_flush_max_actions(1) \
        .set_bulk_flush_max_size_mb(2) \
        .set_bulk_flush_interval(1000) \
        .set_bulk_flush_backoff_strategy(FlushBackoffType.CONSTANT, 3, 3000) \
        .set_connection_username('foo') \
        .set_connection_password('bar') \
        .set_connection_path_prefix('foo-bar') \
        .set_connection_request_timeout(30000) \
        .set_connection_timeout(31000) \
        .set_socket_timeout(32000) \
        .build()

    ds.sink_to(es7_sink).name('es7 sink')

    env.execute()
def write_to_kafka(env):
    ds = env.from_collection([
        (1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'),
        (5, 'hi'), (6, 'hello'), (6, 'hello')],
        type_info=Types.ROW([Types.INT(), Types.STRING()]))

    serialization_schema = AvroRowSerializationSchema(avro_schema_string="""
        {
            "type": "record",
            "name": "TestRecord",
            "fields": [
                {"name": "id", "type": "int"},
                {"name": "name", "type": "string"}
            ]
        }""")

    kafka_producer = FlinkKafkaProducer(
        topic='test_avro_topic',
        serialization_schema=serialization_schema,
        producer_config={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    )

    # note that the output type of ds must be RowTypeInfo
    ds.add_sink(kafka_producer)
    env.execute()
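# A read-side counterpart, sketched under the assumption that the same broker,
# topic and Avro schema as write_to_kafka() above are used; the function name
# and group id are hypothetical. AvroRowDeserializationSchema mirrors the
# serialization schema used by the producer.
def read_avro_from_kafka(env):
    deserialization_schema = AvroRowDeserializationSchema(avro_schema_string="""
        {
            "type": "record",
            "name": "TestRecord",
            "fields": [
                {"name": "id", "type": "int"},
                {"name": "name", "type": "string"}
            ]
        }""")
    kafka_consumer = FlinkKafkaConsumer(
        topics='test_avro_topic',
        deserialization_schema=deserialization_schema,
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group_1'}
    )
    kafka_consumer.set_start_from_earliest()
    env.add_source(kafka_consumer).print()
    env.execute()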
def test_es_sink_dynamic(self):
    ds = self.env.from_collection(
        [{'name': 'ada', 'id': '1'}, {'name': 'luna', 'id': '2'}],
        type_info=Types.MAP(Types.STRING(), Types.STRING()))

    es_dynamic_index_sink = Elasticsearch7SinkBuilder() \
        .set_emitter(ElasticsearchEmitter.dynamic_index('name', 'id')) \
        .set_hosts(['localhost:9200']) \
        .build()

    j_emitter = get_field_value(es_dynamic_index_sink.get_java_function(), 'emitter')
    self.assertTrue(
        is_instance_of(
            j_emitter,
            'org.apache.flink.connector.elasticsearch.sink.MapElasticsearchEmitter'))

    ds.sink_to(es_dynamic_index_sink).name('es dynamic index sink')
def setUp(self):
    super(MinMaxScalerTest, self).setUp()
    self.train_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([0.0, 3.0]),),
            (Vectors.dense([2.1, 0.0]),),
            (Vectors.dense([4.1, 5.1]),),
            (Vectors.dense([6.1, 8.1]),),
            (Vectors.dense([200., 400.]),),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [DenseVectorTypeInfo()])))

    self.predict_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([150.0, 90.0]),),
            (Vectors.dense([50.0, 40.0]),),
            (Vectors.dense([100.0, 50.0]),),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [DenseVectorTypeInfo()])))

    self.expected_data = [
        Vectors.dense(0.25, 0.1),
        Vectors.dense(0.5, 0.125),
        Vectors.dense(0.75, 0.225)
    ]
def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
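# With func_type="pandas", Flink passes each group/window's price column to
# mean_udaf as a pandas.Series, so the aggregation logic can be sanity-checked
# locally without a cluster (assumes pandas is installed):
import pandas as pd

assert abs(pd.Series([110.1, 20.0, 13.1]).mean() - (143.2 / 3)) < 1e-9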
def over_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .build())
                       .build())

    # define the over window operation
    table = table.over_window(
        Over.partition_by(col("name"))
            .order_by(col("ts"))
            .preceding(row_interval(2))
            .following(CURRENT_ROW)
            .alias('w')) \
        .select(table.name, table.price.max.over(col('w')))

    # submit for execution
    table.execute_insert('sink') \
         .wait()
def test_csv_row_serialization_schema(self):
    jvm = get_gateway().jvm
    JRow = jvm.org.apache.flink.types.Row

    j_row = JRow(3)
    j_row.setField(0, "BEGIN")
    j_row.setField(2, "END")

    def field_assertion(field_info, csv_value, value, field_delimiter):
        row_info = Types.ROW([Types.STRING(), field_info, Types.STRING()])
        expected_csv = "BEGIN" + field_delimiter + csv_value + field_delimiter + "END\n"
        j_row.setField(1, value)

        csv_row_serialization_schema = CsvRowSerializationSchema.Builder(row_info) \
            .set_escape_character('*').set_quote_character('\'') \
            .set_array_element_delimiter(':').set_field_delimiter(';').build()
        csv_row_deserialization_schema = CsvRowDeserializationSchema.Builder(row_info) \
            .set_escape_character('*').set_quote_character('\'') \
            .set_array_element_delimiter(':').set_field_delimiter(';').build()
        csv_row_serialization_schema._j_serialization_schema.open(
            jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext())
        csv_row_deserialization_schema._j_deserialization_schema.open(
            jvm.org.apache.flink.connector.testutils.formats.DummyInitializationContext())

        serialized_bytes = csv_row_serialization_schema._j_serialization_schema.serialize(j_row)
        self.assertEqual(expected_csv, str(serialized_bytes, encoding='utf-8'))

        j_deserialized_row = csv_row_deserialization_schema._j_deserialization_schema \
            .deserialize(expected_csv.encode("utf-8"))
        self.assertTrue(j_row.equals(j_deserialized_row))

    field_assertion(Types.STRING(), "'123''4**'", "123'4*", ";")
    field_assertion(Types.STRING(), "'a;b''c'", "a;b'c", ";")
    field_assertion(Types.INT(), "12", 12, ";")

    test_j_row = JRow(2)
    test_j_row.setField(0, "1")
    test_j_row.setField(1, "hello")
    field_assertion(Types.ROW([Types.STRING(), Types.STRING()]), "'1:hello'", test_j_row, ";")
    test_j_row.setField(1, "hello world")
    field_assertion(Types.ROW([Types.STRING(), Types.STRING()]), "'1:hello world'",
                    test_j_row, ";")
    field_assertion(Types.STRING(), "null", "null", ";")
def read_from_kafka(env):
    deserialization_schema = JsonRowDeserializationSchema.Builder() \
        .type_info(Types.ROW([Types.INT(), Types.STRING()])) \
        .build()
    kafka_consumer = FlinkKafkaConsumer(
        topics='test_csv_topic',
        deserialization_schema=deserialization_schema,
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group_1'}
    )
    kafka_consumer.set_start_from_earliest()

    env.add_source(kafka_consumer).print()
    env.execute()
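# A minimal sketch of a driver that wires a producer and this consumer together.
# The jar path is a placeholder, and the write_to_kafka() referenced here is
# assumed to be a matching JSON-format writer (the writers shown elsewhere in
# this collection use Avro and CSV formats).
if __name__ == '__main__':
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///path/to/flink-sql-connector-kafka.jar")
    print("start writing data to kafka")
    write_to_kafka(env)
    print("start reading data from kafka")
    read_from_kafka(env)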
def test_fewer_distinct_points_than_cluster(self):
    input = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([0.0, 0.1]),),
            (Vectors.dense([0.0, 0.1]),),
            (Vectors.dense([0.0, 0.1]),),
        ],
            type_info=Types.ROW_NAMED(['features'], [DenseVectorTypeInfo()])))

    kmeans = KMeans().set_k(2)
    model = kmeans.fit(input)
    output = model.transform(input)[0]

    results = [result for result in
               self.t_env.to_data_stream(output).execute_and_collect()]
    field_names = output.get_schema().get_field_names()
    actual_groups = group_features_by_prediction(
        results,
        field_names.index(kmeans.features_col),
        field_names.index(kmeans.prediction_col))

    expected_groups = [{DenseVector([0.0, 0.1])}]

    self.assertEqual(actual_groups, expected_groups)
def _build_csv_job(self, schema):
    source = FileSource.for_record_stream_format(
        CsvReaderFormat.for_schema(schema), self.csv_file_name).build()
    ds = self.env.from_source(source, WatermarkStrategy.no_watermarks(), 'csv-source')
    ds.map(PassThroughMapFunction(), output_type=Types.PICKLED_BYTE_ARRAY()) \
        .add_sink(self.test_sink)
def setUp(self):
    super(StandardScalerTest, self).setUp()
    self.dense_input = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense(-2.5, 9.0, 1.0),),
            (Vectors.dense(1.4, -5.0, 1.0),),
            (Vectors.dense(2.0, -1.0, -2.0),),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [DenseVectorTypeInfo()])))

    self.expected_res_with_mean = [
        Vectors.dense(-2.8, 8.0, 1.0),
        Vectors.dense(1.1, -6.0, 1.0),
        Vectors.dense(1.7, -2.0, -2.0)
    ]

    self.expected_res_with_std = [
        Vectors.dense(-1.0231819, 1.2480754, 0.5773502),
        Vectors.dense(0.5729819, -0.6933752, 0.5773503),
        Vectors.dense(0.8185455, -0.1386750, -1.1547005)
    ]

    self.expected_res_with_mean_and_std = [
        Vectors.dense(-1.1459637, 1.1094004, 0.5773503),
        Vectors.dense(0.45020003, -0.8320503, 0.5773503),
        Vectors.dense(0.69576368, -0.2773501, -1.1547005)
    ]

    self.expected_mean = [0.3, 1.0, 0.0]
    self.expected_std = [2.4433583, 7.2111026, 1.7320508]
def write_to_kafka(env):
    type_info = Types.ROW([Types.INT(), Types.STRING()])
    ds = env.from_collection([
        (1, 'hi'), (2, 'hello'), (3, 'hi'), (4, 'hello'),
        (5, 'hi'), (6, 'hello'), (6, 'hello')],
        type_info=type_info)

    serialization_schema = CsvRowSerializationSchema.Builder(type_info).build()
    kafka_producer = FlinkKafkaProducer(
        topic='test_csv_topic',
        serialization_schema=serialization_schema,
        producer_config={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'}
    )

    # note that the output type of ds must be RowTypeInfo
    ds.add_sink(kafka_producer)
    env.execute()
def basic_operations():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # define the source
    ds = env.from_collection(
        collection=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        type_info=Types.ROW_NAMED(["id", "info"], [Types.INT(), Types.STRING()]))

    # map
    def update_tel(data):
        # parse the json
        json_data = json.loads(data.info)
        json_data['tel'] += 1
        return data.id, json.dumps(json_data)

    show(ds.map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')
    # (2, '{"name": "hello", "tel": 136, "addr": {"country": "China", "city": "Shanghai"}}')
    # (3, '{"name": "world", "tel": 125, "addr": {"country": "USA", "city": "NewYork"}}')
    # (4, '{"name": "PyFlink", "tel": 33, "addr": {"country": "China", "city": "Hangzhou"}}')

    # filter
    show(ds.filter(lambda data: data.id == 1).map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')

    # key by
    show(ds.map(lambda data: (json.loads(data.info)['addr']['country'],
                              json.loads(data.info)['tel']))
           .key_by(lambda data: data[0]).sum(1), env)
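# basic_operations() relies on a show() helper that is not defined in this
# snippet; a minimal sketch consistent with the commented output would be:
def show(ds, env):
    ds.print()
    env.execute()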
def test_jdbc_sink(self):
    ds = self.env.from_collection([('ab', 1), ('bdc', 2), ('cfgs', 3), ('deeefg', 4)],
                                  type_info=Types.ROW([Types.STRING(), Types.INT()]))
    jdbc_connection_options = JdbcConnectionOptions.JdbcConnectionOptionsBuilder() \
        .with_driver_name('com.mysql.jdbc.Driver') \
        .with_user_name('root') \
        .with_password('password') \
        .with_url('jdbc:mysql://server-name:server-port/database-name').build()

    jdbc_execution_options = JdbcExecutionOptions.builder().with_batch_interval_ms(2000) \
        .with_batch_size(100).with_max_retries(5).build()
    jdbc_sink = JdbcSink.sink("insert into test table", ds.get_type(),
                              jdbc_connection_options, jdbc_execution_options)

    ds.add_sink(jdbc_sink).name('jdbc sink')
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('Sink: jdbc sink', plan['nodes'][1]['type'])
    j_output_format = get_field_value(jdbc_sink.get_java_function(), 'outputFormat')

    connection_options = JdbcConnectionOptions(
        get_field_value(get_field_value(j_output_format, 'connectionProvider'),
                        'jdbcOptions'))
    self.assertEqual(jdbc_connection_options.get_db_url(),
                     connection_options.get_db_url())
    self.assertEqual(jdbc_connection_options.get_driver_name(),
                     connection_options.get_driver_name())
    self.assertEqual(jdbc_connection_options.get_password(),
                     connection_options.get_password())
    self.assertEqual(jdbc_connection_options.get_user_name(),
                     connection_options.get_user_name())

    exec_options = JdbcExecutionOptions(get_field_value(j_output_format, 'executionOptions'))
    self.assertEqual(jdbc_execution_options.get_batch_interval_ms(),
                     exec_options.get_batch_interval_ms())
    self.assertEqual(jdbc_execution_options.get_batch_size(),
                     exec_options.get_batch_size())
    self.assertEqual(jdbc_execution_options.get_max_retries(),
                     exec_options.get_max_retries())
def test_pulsar_source(self):
    TEST_OPTION_NAME = 'pulsar.source.enableAutoAcknowledgeMessage'
    pulsar_source = PulsarSource.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_topics('ada') \
        .set_start_cursor(StartCursor.earliest()) \
        .set_unbounded_stop_cursor(StopCursor.never()) \
        .set_bounded_stop_cursor(StopCursor.at_publish_time(22)) \
        .set_subscription_name('ff') \
        .set_subscription_type(SubscriptionType.Exclusive) \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_type_info(Types.STRING())) \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_schema(SimpleStringSchema())) \
        .set_config(TEST_OPTION_NAME, True) \
        .set_properties({'pulsar.source.autoCommitCursorInterval': '1000'}) \
        .build()

    ds = self.env.from_source(
        source=pulsar_source,
        watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
        source_name="pulsar source")
    ds.print()
    plan = eval(self.env.get_execution_plan())
    self.assertEqual('Source: pulsar source', plan['nodes'][0]['type'])

    configuration = get_field_value(pulsar_source.get_java_function(),
                                    "sourceConfiguration")
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.client.serviceUrl')
            .string_type()
            .no_default_value()._j_config_option), 'pulsar://localhost:6650')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.admin.adminUrl')
            .string_type()
            .no_default_value()._j_config_option), 'http://localhost:8080')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.consumer.subscriptionName')
            .string_type()
            .no_default_value()._j_config_option), 'ff')
    self.assertEqual(
        configuration.getString(
            ConfigOptions.key('pulsar.consumer.subscriptionType')
            .string_type()
            .no_default_value()._j_config_option), SubscriptionType.Exclusive.name)
    test_option = ConfigOptions.key(TEST_OPTION_NAME).boolean_type().no_default_value()
    self.assertEqual(
        configuration.getBoolean(test_option._j_config_option), True)
    self.assertEqual(
        configuration.getLong(
            ConfigOptions.key('pulsar.source.autoCommitCursorInterval')
            .long_type()
            .no_default_value()._j_config_option), 1000)
def setUp(self):
    super(LogisticRegressionTest, self).setUp()
    self.binomial_data_table = self.t_env.from_data_stream(
        self.env.from_collection(
            [
                (Vectors.dense([1, 2, 3, 4]), 0., 1.),
                (Vectors.dense([2, 2, 3, 4]), 0., 2.),
                (Vectors.dense([3, 2, 3, 4]), 0., 3.),
                (Vectors.dense([4, 2, 3, 4]), 0., 4.),
                (Vectors.dense([5, 2, 3, 4]), 0., 5.),
                (Vectors.dense([11, 2, 3, 4]), 1., 1.),
                (Vectors.dense([12, 2, 3, 4]), 1., 2.),
                (Vectors.dense([13, 2, 3, 4]), 1., 3.),
                (Vectors.dense([14, 2, 3, 4]), 1., 4.),
                (Vectors.dense([15, 2, 3, 4]), 1., 5.),
            ],
            type_info=Types.ROW_NAMED(
                ['features', 'label', 'weight'],
                [DenseVectorTypeInfo(), Types.DOUBLE(), Types.DOUBLE()])))
def setUp(self):
    super(StringIndexerTest, self).setUp()
    self.train_table = self.t_env.from_data_stream(
        self.env.from_collection([
            ('a', 1.0),
            ('b', 1.0),
            ('b', 2.0),
            ('c', 0.0),
            ('d', 2.0),
            ('a', 2.0),
            ('b', 2.0),
            ('b', -1.0),
            ('a', -1.0),
            ('c', -1.0),
        ],
            type_info=Types.ROW_NAMED(
                ['input_col1', 'input_col2'],
                [Types.STRING(), Types.DOUBLE()])))

    self.predict_table = self.t_env.from_data_stream(
        self.env.from_collection([
            ('a', 2.0),
            ('b', 1.0),
            ('e', 2.0),
        ],
            type_info=Types.ROW_NAMED(
                ['input_col1', 'input_col2'],
                [Types.STRING(), Types.DOUBLE()])))

    self.expected_alphabetic_asc_predict_data = [
        Row('a', 2.0, 0, 3),
        Row('b', 1.0, 1, 2),
        Row('e', 2.0, 4, 3)
    ]
def setUp(self):
    super(VectorAssemblerTest, self).setUp()
    self.input_data_table = self.t_env.from_data_stream(
        self.env.from_collection([
            (0, Vectors.dense(2.1, 3.1), 1.0, Vectors.sparse(5, [3], [1.0])),
            (1, Vectors.dense(2.1, 3.1), 1.0,
             Vectors.sparse(5, [1, 2, 3, 4], [1.0, 2.0, 3.0, 4.0])),
            (2, None, None, None),
        ],
            type_info=Types.ROW_NAMED(
                ['id', 'vec', 'num', 'sparse_vec'],
                [Types.INT(), DenseVectorTypeInfo(), Types.DOUBLE(),
                 SparseVectorTypeInfo()])))

    self.expected_output_data_1 = Vectors.sparse(8, [0, 1, 2, 6], [2.1, 3.1, 1.0, 1.0])
    self.expected_output_data_2 = Vectors.dense(2.1, 3.1, 1.0, 0.0, 1.0, 2.0, 3.0, 4.0)
def setUp(self):
    super(NaiveBayesTest, self).setUp()
    self.env.set_parallelism(1)
    self.train_data = self.t_env.from_data_stream(
        self.env.from_collection(
            [
                (Vectors.dense([0, 0.]), 11.),
                (Vectors.dense([1, 0]), 10.),
                (Vectors.dense([1, 1.]), 10.),
            ],
            type_info=Types.ROW_NAMED(
                ['features', 'label'],
                [DenseVectorTypeInfo(), Types.DOUBLE()])))

    self.predict_data = self.t_env.from_data_stream(
        self.env.from_collection(
            [
                (Vectors.dense([0, 1.]),),
                (Vectors.dense([0, 0.]),),
                (Vectors.dense([1, 0]),),
                (Vectors.dense([1, 1.]),),
            ],
            type_info=Types.ROW_NAMED(['features'], [DenseVectorTypeInfo()])))

    self.expected_output = {
        Vectors.dense([0, 1.]): 11.,
        Vectors.dense([0, 0.]): 11.,
        Vectors.dense([1, 0.]): 10.,
        Vectors.dense([1, 1.]): 10.,
    }

    self.estimator = NaiveBayes() \
        .set_smoothing(1.0) \
        .set_features_col('features') \
        .set_label_col('label') \
        .set_prediction_col('prediction') \
        .set_model_type('multinomial')  # type: NaiveBayes
def setUp(self):
    super(KNNTest, self).setUp()
    self.train_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([2.0, 3.0]), 1.0),
            (Vectors.dense([2.1, 3.1]), 1.0),
            (Vectors.dense([200.1, 300.1]), 2.0),
            (Vectors.dense([200.2, 300.2]), 2.0),
            (Vectors.dense([200.3, 300.3]), 2.0),
            (Vectors.dense([200.4, 300.4]), 2.0),
            (Vectors.dense([200.4, 300.4]), 2.0),
            (Vectors.dense([200.6, 300.6]), 2.0),
            (Vectors.dense([2.1, 3.1]), 1.0),
            (Vectors.dense([2.1, 3.1]), 1.0),
            (Vectors.dense([2.1, 3.1]), 1.0),
            (Vectors.dense([2.1, 3.1]), 1.0),
            (Vectors.dense([2.3, 3.2]), 1.0),
            (Vectors.dense([2.3, 3.2]), 1.0),
            (Vectors.dense([2.8, 3.2]), 3.0),
            (Vectors.dense([300., 3.2]), 4.0),
            (Vectors.dense([2.2, 3.2]), 1.0),
            (Vectors.dense([2.4, 3.2]), 5.0),
            (Vectors.dense([2.5, 3.2]), 5.0),
            (Vectors.dense([2.5, 3.2]), 5.0),
            (Vectors.dense([2.1, 3.1]), 1.0)
        ],
            type_info=Types.ROW_NAMED(
                ['features', 'label'],
                [DenseVectorTypeInfo(), Types.DOUBLE()])))

    self.predict_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([4.0, 4.1]), 5.0),
            (Vectors.dense([300, 42]), 2.0),
        ],
            type_info=Types.ROW_NAMED(
                ['features', 'label'],
                [DenseVectorTypeInfo(), Types.DOUBLE()])))
def test_rabbitmq_connectors(self):
    connection_config = RMQConnectionConfig.Builder() \
        .set_host('localhost') \
        .set_port(5672) \
        .set_virtual_host('/') \
        .set_user_name('guest') \
        .set_password('guest') \
        .build()
    type_info = Types.ROW([Types.INT(), Types.STRING()])
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=type_info).build()

    rmq_source = RMQSource(connection_config, 'source_queue', True, deserialization_schema)
    self.assertEqual(
        get_field_value(rmq_source.get_java_function(), 'queueName'), 'source_queue')
    self.assertTrue(get_field_value(rmq_source.get_java_function(), 'usesCorrelationId'))

    serialization_schema = JsonRowSerializationSchema.builder().with_type_info(type_info) \
        .build()
    rmq_sink = RMQSink(connection_config, 'sink_queue', serialization_schema)
    self.assertEqual(
        get_field_value(rmq_sink.get_java_function(), 'queueName'), 'sink_queue')
def write_to_es6_dynamic_index(env):
    ELASTICSEARCH_SQL_CONNECTOR_PATH = \
        'file:///path/to/flink-sql-connector-elasticsearch6-1.16.0.jar'
    env.add_jars(ELASTICSEARCH_SQL_CONNECTOR_PATH)

    ds = env.from_collection(
        [{'name': 'ada', 'id': '1'}, {'name': 'luna', 'id': '2'}],
        type_info=Types.MAP(Types.STRING(), Types.STRING()))

    es_sink = Elasticsearch6SinkBuilder() \
        .set_emitter(ElasticsearchEmitter.dynamic_index('name', 'id', 'bar')) \
        .set_hosts(['localhost:9200']) \
        .build()

    ds.sink_to(es_sink).name('es6 dynamic index sink')

    env.execute()
def test_max_value_equals_min_value_but_predict_value_not_equals(self):
    train_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([40.0, 80.0]),),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [DenseVectorTypeInfo()])))
    predict_data = self.t_env.from_data_stream(
        self.env.from_collection([
            (Vectors.dense([30.0, 50.0]),),
        ],
            type_info=Types.ROW_NAMED(
                ['input'],
                [DenseVectorTypeInfo()])))
    min_max_scaler = MinMaxScaler() \
        .set_min(0.0) \
        .set_max(10.0)
    model = min_max_scaler.fit(train_data)
    result = model.transform(predict_data)[0]
    self.verify_output_result(result,
                              min_max_scaler.get_output_col(),
                              result.get_schema().get_field_names(),
                              [Vectors.dense(5.0, 5.0)])
def mixing_use_of_datastream_and_table():
    # use StreamTableEnvironment instead of TableEnvironment when mixing use of table & datastream
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source
    t_env.create_temporary_table(
        'source',
        TableDescriptor.for_connector('datagen')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .option("number-of-rows", "10")
                       .build())

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('a', DataTypes.BIGINT())
                               .build())
                       .build())

    @udf(result_type=DataTypes.BIGINT())
    def length(data):
        return len(data)

    # perform table api operations
    table = t_env.from_path("source")
    table = table.select(col('id'), length(col('data')))

    # convert table to datastream and perform datastream api operations
    ds = t_env.to_data_stream(table)
    ds = ds.map(lambda i: i[0] + i[1], output_type=Types.LONG())

    # convert datastream to table and perform table api operations as you want
    table = t_env.from_data_stream(
        ds,
        Schema.new_builder().column("f0", DataTypes.BIGINT()).build())

    # execute
    table.execute_insert('sink') \
         .wait()
def test_source_deprecated_method(self):
    test_option = ConfigOptions.key('pulsar.source.enableAutoAcknowledgeMessage') \
        .boolean_type().no_default_value()
    pulsar_source = PulsarSource.builder() \
        .set_service_url('pulsar://localhost:6650') \
        .set_admin_url('http://localhost:8080') \
        .set_topic_pattern('ada.*') \
        .set_deserialization_schema(
            PulsarDeserializationSchema.flink_type_info(Types.STRING())) \
        .set_unbounded_stop_cursor(StopCursor.at_publish_time(4444)) \
        .set_subscription_name('ff') \
        .set_config(test_option, True) \
        .set_properties({'pulsar.source.autoCommitCursorInterval': '1000'}) \
        .build()
    configuration = get_field_value(pulsar_source.get_java_function(),
                                    "sourceConfiguration")
    self.assertEqual(
        configuration.getBoolean(test_option._j_config_option), True)
    self.assertEqual(
        configuration.getLong(
            ConfigOptions.key('pulsar.source.autoCommitCursorInterval')
            .long_type()
            .no_default_value()._j_config_option), 1000)
def setUp(self):
    super(KMeansTest, self).setUp()
    self.data_table = self.t_env.from_data_stream(
        self.env.from_collection(
            [
                (Vectors.dense([0.0, 0.0]),),
                (Vectors.dense([0.0, 0.3]),),
                (Vectors.dense([0.3, 3.0]),),
                (Vectors.dense([9.0, 0.0]),),
                (Vectors.dense([9.0, 0.6]),),
                (Vectors.dense([9.6, 0.0]),),
            ],
            type_info=Types.ROW_NAMED(['features'], [DenseVectorTypeInfo()])))

    self.expected_groups = [
        {DenseVector([0.0, 0.3]), DenseVector([0.3, 3.0]), DenseVector([0.0, 0.0])},
        {DenseVector([9.6, 0.0]), DenseVector([9.0, 0.0]), DenseVector([9.0, 0.6])}
    ]