def test_topic(self):
    kafka = Kafka().topic("topic1")

    properties = kafka.to_properties()
    expected = {'connector.type': 'kafka',
                'connector.topic': 'topic1',
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)
def test_property(self):
    kafka = Kafka().property("group.id", "testGroup")

    properties = kafka.to_properties()
    expected = {'connector.type': 'kafka',
                'connector.properties.group.id': 'testGroup',
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)
def test_start_from_group_offsets(self):
    kafka = Kafka().start_from_group_offsets()

    properties = kafka.to_properties()
    expected = {'connector.type': 'kafka',
                'connector.startup-mode': 'group-offsets',
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)
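# The two tests below are a sketch, not part of the original suite: they follow the same
# pattern for start_from_earliest()/start_from_latest(), assuming these methods map to the
# standard Flink startup-mode values 'earliest-offset' and 'latest-offset'.
def test_start_from_earliest(self):
    kafka = Kafka().start_from_earliest()

    properties = kafka.to_properties()
    expected = {'connector.type': 'kafka',
                'connector.startup-mode': 'earliest-offset',  # assumed property value
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)

def test_start_from_latest(self):
    kafka = Kafka().start_from_latest()

    properties = kafka.to_properties()
    expected = {'connector.type': 'kafka',
                'connector.startup-mode': 'latest-offset',  # assumed property value
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)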
def test_sink_partitioner_round_robin(self):
    kafka = Kafka().sink_partitioner_round_robin()

    properties = kafka.to_properties()
    expected = {'connector.sink-partitioner': 'round-robin',
                'connector.type': 'kafka',
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)
def test_version(self):
    kafka = Kafka().version("0.11")

    properties = kafka.to_properties()
    expected = {'connector.version': '0.11',
                'connector.type': 'kafka',
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)
def test_start_from_specific_offset(self):
    kafka = Kafka().start_from_specific_offset(3, 300)

    properties = kafka.to_properties()
    expected = {'connector.startup-mode': 'specific-offsets',
                'connector.specific-offsets': 'partition:3,offset:300',
                'connector.type': 'kafka',
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)
def test_properties(self): kafka = Kafka().properties({"bootstrap.servers": "localhost:9092"}) properties = kafka.to_properties() expected = { 'connector.type': 'kafka', 'connector.startup-mode': 'group-offsets', 'connector.properties.bootstrap.servers': 'localhost:9092', 'connector.property-version': '1' } self.assertEqual(expected, properties)
def test_sink_partitioner_fixed(self):
    kafka = Kafka().start_from_group_offsets().sink_partitioner_fixed()

    properties = kafka.to_properties()
    expected = {'connector.sink-partitioner': 'fixed',
                'connector.startup-mode': 'group-offsets',
                'connector.type': 'kafka',
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)
def test_start_from_specific_offsets(self):
    kafka = Kafka().start_from_specific_offsets({1: 220, 3: 400})

    properties = kafka.to_properties()
    expected = {'connector.startup-mode': 'specific-offsets',
                'connector.specific-offsets.0.partition': '1',
                'connector.specific-offsets.0.offset': '220',
                'connector.specific-offsets.1.partition': '3',
                'connector.specific-offsets.1.offset': '400',
                'connector.type': 'kafka',
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)
def test_properties(self): kafka = Kafka().properties({"zookeeper.connect": "localhost:2181", "bootstrap.servers": "localhost:9092"}) properties = kafka.to_properties() expected = {'connector.type': 'kafka', 'connector.properties.0.key': 'zookeeper.connect', 'connector.properties.0.value': 'localhost:2181', 'connector.properties.1.key': 'bootstrap.servers', 'connector.properties.1.value': 'localhost:9092', 'connector.property-version': '1'} self.assertEqual(expected, properties)
def test_sink_partitioner_custom(self):
    kafka = Kafka().sink_partitioner_custom(
        "org.apache.flink.streaming.connectors.kafka.partitioner.FlinkFixedPartitioner")

    properties = kafka.to_properties()
    expected = {'connector.sink-partitioner': 'custom',
                'connector.sink-partitioner-class':
                    'org.apache.flink.streaming.connectors.kafka.partitioner.FlinkFixedPartitioner',
                'connector.type': 'kafka',
                'connector.property-version': '1'}
    self.assertEqual(expected, properties)
def register_rides_source(st_env): st_env \ .connect( # declare the external system to connect to Kafka() .version("0.11") .topic("Rides") .start_from_earliest() .property("zookeeper.connect", "zookeeper:2181") .property("bootstrap.servers", "kafka:9092")) \ .with_format( # declare a format for this system Json() .fail_on_missing_field(True) .schema(DataTypes.ROW([ DataTypes.FIELD("rideId", DataTypes.BIGINT()), DataTypes.FIELD("isStart", DataTypes.BOOLEAN()), DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()), DataTypes.FIELD("lon", DataTypes.FLOAT()), DataTypes.FIELD("lat", DataTypes.FLOAT()), DataTypes.FIELD("psgCnt", DataTypes.INT()), DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \ .with_schema( # declare the schema of the table Schema() .field("rideId", DataTypes.BIGINT()) .field("taxiId", DataTypes.BIGINT()) .field("isStart", DataTypes.BOOLEAN()) .field("lon", DataTypes.FLOAT()) .field("lat", DataTypes.FLOAT()) .field("psgCnt", DataTypes.INT()) .field("rideTime", DataTypes.TIMESTAMP()) .rowtime( Rowtime() .timestamps_from_field("eventTime") .watermarks_periodic_bounded(60000))) \ .in_append_mode() \ .register_table_source("source")
def register_rides_source(st_env): st_env \ .connect( # declare the external system to connect to Kafka() .version("universal") .topic("Rides") .start_from_earliest() .property("zookeeper.connect", "zookeeper:2181") .property("bootstrap.servers", "kafka:9092")) \ .with_format( # declare a format for this system Json() .fail_on_missing_field(True) .schema(DataTypes.ROW([ DataTypes.FIELD("rideId", DataTypes.BIGINT()), DataTypes.FIELD("isStart", DataTypes.BOOLEAN()), DataTypes.FIELD("eventTime", DataTypes.STRING()), DataTypes.FIELD("lon", DataTypes.FLOAT()), DataTypes.FIELD("lat", DataTypes.FLOAT()), DataTypes.FIELD("psgCnt", DataTypes.INT()), DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \ .with_schema( # declare the schema of the table Schema() .field("rideId", DataTypes.BIGINT()) .field("taxiId", DataTypes.BIGINT()) .field("isStart", DataTypes.BOOLEAN()) .field("lon", DataTypes.FLOAT()) .field("lat", DataTypes.FLOAT()) .field("psgCnt", DataTypes.INT()) .field("eventTime", DataTypes.STRING())) \ .in_append_mode() \ .create_temporary_table("source")
def register_transactions_source(st_env): st_env.connect(Kafka() .version("universal") .topic("transactions-data") .start_from_latest() .property("zookeeper.connect", "host.docker.internal:2181") .property("bootstrap.servers", "host.docker.internal:19091")) \ .with_format(Json() .fail_on_missing_field(True) .schema(DataTypes.ROW([ DataTypes.FIELD("customer", DataTypes.STRING()), DataTypes.FIELD("transaction_type", DataTypes.STRING()), DataTypes.FIELD("online_payment_amount", DataTypes.DOUBLE()), DataTypes.FIELD("in_store_payment_amount", DataTypes.DOUBLE()), DataTypes.FIELD("lat", DataTypes.DOUBLE()), DataTypes.FIELD("lon", DataTypes.DOUBLE()), DataTypes.FIELD("transaction_datetime", DataTypes.TIMESTAMP())]))) \ .with_schema(Schema() .field("customer", DataTypes.STRING()) .field("transaction_type", DataTypes.STRING()) .field("online_payment_amount", DataTypes.DOUBLE()) .field("in_store_payment_amount", DataTypes.DOUBLE()) .field("lat", DataTypes.DOUBLE()) .field("lon", DataTypes.DOUBLE()) .field("rowtime", DataTypes.TIMESTAMP()) .rowtime( Rowtime() .timestamps_from_field("transaction_datetime") .watermarks_periodic_bounded(60000))) \ .in_append_mode() \ .register_table_source("source")
def register_rides_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("rideTime", DataTypes.TIMESTAMP())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())) \
        .in_append_mode() \
        .register_table_sink("sink")
def register_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("performance_source")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT())]))
            .fail_on_missing_field(True)) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("a", DataTypes.INT())) \
        .in_append_mode() \
        .create_temporary_table("source")
def register_ride_duration_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("durationMin", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("durationMin", DataTypes.BIGINT())) \
        .in_append_mode() \
        .register_table_sink("TempResults")
def register_transactions_source(st_env): st_env.connect(Kafka() .version("universal") .topic("server-logs") .start_from_earliest() .property("zookeeper.connect", "localhost:2181") .property("bootstrap.servers", "localhost:9092")) \ .with_format(Json() .fail_on_missing_field(True) .schema(DataTypes.ROW([ DataTypes.FIELD("event_id", DataTypes.STRING()), DataTypes.FIELD("account_id", DataTypes.DOUBLE()), DataTypes.FIELD("event_type", DataTypes.DOUBLE()), DataTypes.FIELD("location_country", DataTypes.DOUBLE()), DataTypes.FIELD("event_timestamp", DataTypes.TIMESTAMP(precision=3))]))) \ .with_schema(Schema() .field("event_id", DataTypes.STRING()) .field("account_id", DataTypes.DOUBLE()) .field("event_type", DataTypes.STRING()) .field("location_country", DataTypes.STRING()) .field("event_timestamp", DataTypes.TIMESTAMP(precision=3))) \ .in_append_mode() \ .create_temporary_table("source")
def pv_uv_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)

    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)

    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user_behavior")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                " type: 'object',"
                " properties: {"
                " user_id: {"
                " type: 'string'"
                " },"
                " item_id: {"
                " type: 'string'"
                " },"
                " category_id: {"
                " type: 'string'"
                " },"
                " behavior: {"
                " type: 'string'"
                " },"
                " ts: {"
                " type: 'string',"
                " format: 'date-time'"
                " }"
                " }"
                "}")) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("user_id", DataTypes.STRING())
            .field("item_id", DataTypes.STRING())
            .field("category_id", DataTypes.STRING())
            .field("behavior", DataTypes.STRING())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("ts")
                .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")

    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \
        .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \
        .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \
        .property("connector.table", "pv_uv_table") \
        .property("connector.write.flush.max-rows", "1")
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("startTime", DataTypes.TIMESTAMP())
            .field("endTime", DataTypes.TIMESTAMP())
            .field("pv", DataTypes.BIGINT())
            .field("uv", DataTypes.BIGINT())
        ).register_table_sink("sink")

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("w.start as startTime, w.end as endTime, COUNT(1) as pv, user_id.count.distinct as uv") \
        .insert_into("sink")
    st_env.execute("table pv uv")
).in_streaming_mode().build()
st_env = StreamTableEnvironment.create(s_env, environment_settings=env_settings)
st_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka-base_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-jdbc_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-sql-connector-kafka_2.11-1.11.0.jar;"
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/kafka-clients-2.1.0.jar"
)

# read from Kafka
properties = {
    "zookeeper.connect": "nn1.hadoop:2181,nn2.hadoop:2181,s1.hadoop:2181",
    "bootstrap.servers": "nn1.hadoop:9092,nn2.hadoop:9092,s1.hadoop:9092",
    "group.id": "testGroup"
}
st_env.connect(Kafka().properties(properties).version("universal").topic("test").start_from_latest()) \
    .with_format(Json()) \
    .with_schema(Schema()
                 .field('throughputReqMax', DataTypes.BIGINT())
                 .field('throughputReqTotal', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

# write to CSV
st_env.connect(FileSystem().path('/usr/local/flink/test/result3.txt')) \
    .with_format(OldCsv()
                 .field('sub', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sub', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

# read fields a and b from the Kafka data, add them, multiply the sum by 2, and insert into the sink
st_env.from_path('mySource')\
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment, StreamTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem, Kafka, Json, Csv

exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

t_env.connect(Kafka()
              .version("0.11")
              .topic("test")
              .property("zookeeper.connect", "localhost:2181")
              .property("bootstrap.servers", "localhost:9092")) \
    .in_append_mode() \
    .with_format(Csv()
                 .line_delimiter("\r\n")
                 .derive_schema()) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path('../production_data/kafkaoutput')) \
    .with_format(OldCsv()
                 .field('tbd', DataTypes.INT())) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_sink('mySink')
if os.path.exists(result_file):
    os.remove(result_file)

# udf
@udf(input_types=[DataTypes.DECIMAL(38, 12, nullable=True)],
     result_type=DataTypes.DECIMAL(38, 12, nullable=True))
def myadd(i):
    return i * i * 2

st_env.register_function("add", myadd)

# way kafka
st_env \
    .connect(  # declare the external system to connect to
        Kafka()
        .version("universal")
        .topic("user")
        # .start_from_earliest()
        .start_from_specific_offset(0, 496)
        .property("zookeeper.connect", "6.86.2.170:2181")
        .property("bootstrap.servers", "6.86.2.170:9092")) \
    .with_format(  # declare a format for this system
        Json()
        .fail_on_missing_field(True)
        .json_schema(
            "{"
            " type: 'object',"
            " properties: {"
            " a: {"
            " type: 'string'"
            " },"
            " b: {"
from pyflink.table.window import Tumble

if __name__ == '__main__':
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/tumble_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                " type: 'object',"
                " properties: {"
                " a: {"
                " type: 'string'"
                " },"
                " b: {"
                " type: 'string'"
    for i, sentence in enumerate(sentences):
        for no, k in enumerate(sentence.split()[:maxlen][::-1]):
            x[i, -1 - no] = dic.get(k, UNK)
    indices = np.argmax(sess.run(Y, feed_dict={X: x}), axis=1)
    return label[indices[0]]


st_env.set_python_requirements('/notebooks/requirements.txt')
st_env.register_function('predict', predict)

st_env.connect(
    Kafka()
    .version('universal')
    .topic('test')
    .start_from_earliest()
    .property('zookeeper.connect', 'zookeeper:2181')
    .property('bootstrap.servers', 'kafka:9092')
).with_format(
    Json()
    .fail_on_missing_field(True)
    .schema(
        DataTypes.ROW(
            [
                DataTypes.FIELD('datetime', DataTypes.STRING()),
                DataTypes.FIELD('text', DataTypes.STRING()),
            ]
        )
    )
).with_schema(
    Schema()
s_env = StreamExecutionEnvironment.get_execution_environment()
s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
s_env.set_parallelism(1)

# use blink table planner
st_env = StreamTableEnvironment \
    .create(s_env, environment_settings=EnvironmentSettings
            .new_instance()
            .in_streaming_mode()
            .use_blink_planner().build())

st_env \
    .connect(  # declare the external system to connect to
        Kafka()
        .version("0.11")
        .topic("Rides")
        .start_from_earliest()
        .property("zookeeper.connect", "zookeeper:2181")
        .property("bootstrap.servers", "kafka:9092")) \
    .with_format(  # declare a format for this system
        Json()
        .fail_on_missing_field(True)
        .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
    .with_schema(  # declare the schema of the table
        Schema()
def distinct_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance()
        .in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)

    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                " type: 'object',"
                " properties: {"
                " a: {"
                " type: 'string'"
                " },"
                " b: {"
                " type: 'string'"
                " },"
                " c: {"
                " type: 'string'"
                " },"
                " time: {"
                " type: 'string',"
                " format: 'date-time'"
                " }"
                " }"
                "}")) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())) \
        .in_append_mode() \
        .register_table_source("Orders")

    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("distinct_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())) \
        .with_format(
            Json()
            .derive_schema()) \
        .in_upsert_mode() \
        .register_table_sink("result")

    orders = st_env.scan("Orders")
    result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \
        .group_by("a, w").select("a, b.max.distinct as d")
    result.insert_into("result")
    st_env.execute("distinct agg streaming")
from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka
from pyflink.table.window import Tumble

s_env = StreamExecutionEnvironment.get_execution_environment()
s_env.set_parallelism(1)
s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

# Stream Table
st_env = StreamTableEnvironment.create(s_env)

# Set source Kafka table
st_env \
    .connect(  # declare the external system to connect to
        Kafka()
        .version("0.11")
        .topic("input")
        .start_from_earliest()
        .property("zookeeper.connect", "zookeeper:2181")
        .property("bootstrap.servers", "kafka:9092")) \
    .with_format(  # declare a format for this system
        Json()
        .fail_on_missing_field(True)
        .json_schema(
            "{"
            " type: 'object',"
            " properties: {"
            " timestamp: {"
            " type: 'string'"
            " },"
            " page: {"
            " type: 'string'"
def tumble_time_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/tumble_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                " type: 'object',"
                " properties: {"
                " a: {"
                " type: 'string'"
                " },"
                " b: {"
                " type: 'string'"
                " },"
                " c: {"
                " type: 'string'"
                " },"
                " time: {"
                " type: 'string',"
                " format: 'date-time'"
                " }"
                " }"
                "}")) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(), DataTypes.STRING()],
                                            result_file))

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("tumble time window streaming")
table_env = StreamTableEnvironment.create(env, table_config)

from pyflink.table.descriptors import Kafka, Json, OldCsv, Schema, FileSystem

directories = ['/flink/lib']
for directory in directories:
    for jar in glob.glob(os.path.join(directory, '*.jar')):
        sys.path.append(jar)

# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer11
# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer09

OldCsv()
print("debug 010")
Kafka()
print("debug 020")
Json()
print("debug 030")

sourcetable = table_env \
    .connect(Kafka()
             .properties({'update-mode': 'append',
                          'connector.topic': 'machine.data',
                          'connector.properties.zookeeper.connect': 'localhost:2181',
                          'connector.properties.bootstrap.servers': 'localhost:9092'})) \
    .with_format(Json()
                 .json_schema(
                     "{type:'object',properties:{thing: {type: 'string'},quantity:{type:'string'},"
                     "phenomenonTime:{type:'integer'},result:{type:'number'}}}")
                 .fail_on_missing_field(False)) \
    .with_schema(Schema()