Example no. 1
    def test_topic(self):
        kafka = Kafka().topic("topic1")

        properties = kafka.to_properties()
        expected = {
            'connector.type': 'kafka',
            'connector.topic': 'topic1',
            'connector.property-version': '1'
        }
        self.assertEqual(expected, properties)
Example no. 2
    def test_property(self):
        kafka = Kafka().property("group.id", "testGroup")

        properties = kafka.to_properties()
        expected = {
            'connector.type': 'kafka',
            'connector.properties.group.id': 'testGroup',
            'connector.property-version': '1'
        }
        self.assertEqual(expected, properties)
Example no. 3
    def test_start_from_group_offsets(self):
        kafka = Kafka().start_from_group_offsets()

        properties = kafka.to_properties()
        expected = {
            'connector.type': 'kafka',
            'connector.startup-mode': 'group-offsets',
            'connector.property-version': '1'
        }
        self.assertEqual(expected, properties)
Example no. 4
    def test_sink_partitioner_round_robin(self):
        kafka = Kafka().sink_partitioner_round_robin()

        properties = kafka.to_properties()
        expected = {
            'connector.sink-partitioner': 'round-robin',
            'connector.type': 'kafka',
            'connector.property-version': '1'
        }
        self.assertEqual(expected, properties)
Example no. 5
    def test_version(self):
        kafka = Kafka().version("0.11")

        properties = kafka.to_properties()
        expected = {
            'connector.version': '0.11',
            'connector.type': 'kafka',
            'connector.property-version': '1'
        }
        self.assertEqual(expected, properties)
Example no. 6
    def test_start_from_specific_offset(self):
        kafka = Kafka().start_from_specific_offset(3, 300)

        properties = kafka.to_properties()
        expected = {
            'connector.startup-mode': 'specific-offsets',
            'connector.specific-offsets': 'partition:3,offset:300',
            'connector.type': 'kafka',
            'connector.property-version': '1'
        }
        self.assertEqual(expected, properties)
Example no. 7
    def test_properties(self):
        kafka = Kafka().properties({"bootstrap.servers": "localhost:9092"})

        properties = kafka.to_properties()
        expected = {
            'connector.type': 'kafka',
            'connector.startup-mode': 'group-offsets',
            'connector.properties.bootstrap.servers': 'localhost:9092',
            'connector.property-version': '1'
        }
        self.assertEqual(expected, properties)
Example no. 8
    def test_sink_partitioner_fixed(self):
        kafka = Kafka().sink_partitioner_fixed()

        properties = kafka.to_properties()
        expected = {
            'connector.sink-partitioner': 'fixed',
            'connector.startup-mode': 'group-offsets',
            'connector.type': 'kafka',
            'connector.property-version': '1'
        }
        self.assertEqual(expected, properties)
Example no. 9
    def test_start_from_specific_offsets(self):
        kafka = Kafka().start_from_specific_offsets({1: 220, 3: 400})

        properties = kafka.to_properties()
        expected = {'connector.startup-mode': 'specific-offsets',
                    'connector.specific-offsets.0.partition': '1',
                    'connector.specific-offsets.0.offset': '220',
                    'connector.specific-offsets.1.partition': '3',
                    'connector.specific-offsets.1.offset': '400',
                    'connector.type': 'kafka',
                    'connector.property-version': '1'}
        self.assertEqual(expected, properties)
Example no. 10
    def test_properties(self):
        kafka = Kafka().properties({"zookeeper.connect": "localhost:2181",
                                    "bootstrap.servers": "localhost:9092"})

        properties = kafka.to_properties()
        expected = {'connector.type': 'kafka',
                    'connector.properties.0.key': 'zookeeper.connect',
                    'connector.properties.0.value': 'localhost:2181',
                    'connector.properties.1.key': 'bootstrap.servers',
                    'connector.properties.1.value': 'localhost:9092',
                    'connector.property-version': '1'}
        self.assertEqual(expected, properties)
Example no. 11
    def test_sink_partitioner_custom(self):
        kafka = Kafka().sink_partitioner_custom(
            "org.apache.flink.streaming.connectors.kafka.partitioner.FlinkFixedPartitioner")

        properties = kafka.to_properties()
        expected = {'connector.sink-partitioner': 'custom',
                    'connector.sink-partitioner-class':
                        'org.apache.flink.streaming.connectors.kafka.partitioner.'
                        'FlinkFixedPartitioner',
                    'connector.type': 'kafka',
                    'connector.property-version': '1'}
        self.assertEqual(expected, properties)
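
The options exercised above can be chained on a single descriptor; a minimal sketch, assuming the same pyflink.table.descriptors.Kafka API used by these tests:

from pyflink.table.descriptors import Kafka

kafka = Kafka() \
    .version("universal") \
    .topic("topic1") \
    .start_from_group_offsets() \
    .property("group.id", "testGroup") \
    .sink_partitioner_round_robin()

# to_properties() flattens every chained option into the flat
# 'connector.*' key/value map asserted in the tests above.
print(kafka.to_properties())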
Example no. 12
def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())
            .rowtime(
            Rowtime()
                .timestamps_from_field("eventTime")
                .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
Example no. 13
def register_rides_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("universal")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("eventTime", DataTypes.STRING()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("eventTime", DataTypes.STRING())) \
        .in_append_mode() \
        .create_temporary_table("source")
Example no. 14
def register_transactions_source(st_env):
    st_env.connect(Kafka()
                   .version("universal")
                   .topic("transactions-data")
                   .start_from_latest()
                   .property("zookeeper.connect", "host.docker.internal:2181")
                   .property("bootstrap.servers", "host.docker.internal:19091")) \
        .with_format(Json()
        .fail_on_missing_field(True)
        .schema(DataTypes.ROW([
        DataTypes.FIELD("customer", DataTypes.STRING()),
        DataTypes.FIELD("transaction_type", DataTypes.STRING()),
        DataTypes.FIELD("online_payment_amount", DataTypes.DOUBLE()),
        DataTypes.FIELD("in_store_payment_amount", DataTypes.DOUBLE()),
        DataTypes.FIELD("lat", DataTypes.DOUBLE()),
        DataTypes.FIELD("lon", DataTypes.DOUBLE()),
        DataTypes.FIELD("transaction_datetime", DataTypes.TIMESTAMP())]))) \
        .with_schema(Schema()
        .field("customer", DataTypes.STRING())
        .field("transaction_type", DataTypes.STRING())
        .field("online_payment_amount", DataTypes.DOUBLE())
        .field("in_store_payment_amount", DataTypes.DOUBLE())
        .field("lat", DataTypes.DOUBLE())
        .field("lon", DataTypes.DOUBLE())
        .field("rowtime", DataTypes.TIMESTAMP())
        .rowtime(
        Rowtime()
            .timestamps_from_field("transaction_datetime")
            .watermarks_periodic_bounded(60000))) \
        .in_append_mode() \
        .register_table_source("source")
Example no. 15
def register_rides_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("taxiId", DataTypes.BIGINT()),
            DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
            DataTypes.FIELD("lon", DataTypes.FLOAT()),
            DataTypes.FIELD("lat", DataTypes.FLOAT()),
            DataTypes.FIELD("psgCnt", DataTypes.INT()),
            DataTypes.FIELD("rideTime", DataTypes.TIMESTAMP())
        ]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("taxiId", DataTypes.BIGINT())
            .field("isStart", DataTypes.BOOLEAN())
            .field("lon", DataTypes.FLOAT())
            .field("lat", DataTypes.FLOAT())
            .field("psgCnt", DataTypes.INT())
            .field("rideTime", DataTypes.TIMESTAMP())) \
        .in_append_mode() \
        .register_table_sink("sink")
Example no. 16
def register_source(st_env):
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("performance_source")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")) \
        .with_format(  # declare a format for this system
            Json()
            .schema(DataTypes.ROW([DataTypes.FIELD("a", DataTypes.INT())]))
            .fail_on_missing_field(True)) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("a", DataTypes.INT())) \
        .in_append_mode() \
        .create_temporary_table("source")
Example no. 17
def register_ride_duration_sink(st_env):
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("TempResults")
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
            DataTypes.FIELD("rideId", DataTypes.BIGINT()),
            DataTypes.FIELD("durationMin", DataTypes.BIGINT())
        ]))) \
        .with_schema(  # declare the schema of the table
        Schema()
            .field("rideId", DataTypes.BIGINT())
            .field("durationMin", DataTypes.BIGINT())) \
        .in_append_mode() \
        .register_table_sink("TempResults")
Example no. 18
def register_transactions_source(st_env):
    st_env.connect(Kafka()
                   .version("universal")
                   .topic("server-logs")
                   .start_from_earliest()
                   .property("zookeeper.connect", "localhost:2181")
                   .property("bootstrap.servers", "localhost:9092")) \
        .with_format(Json()
        .fail_on_missing_field(True)
        .schema(DataTypes.ROW([
        DataTypes.FIELD("event_id", DataTypes.STRING()),
        DataTypes.FIELD("account_id", DataTypes.DOUBLE()),
        DataTypes.FIELD("event_type", DataTypes.DOUBLE()),
        DataTypes.FIELD("location_country", DataTypes.DOUBLE()),
        DataTypes.FIELD("event_timestamp", DataTypes.TIMESTAMP(precision=3))]))) \
        .with_schema(Schema()
        .field("event_id", DataTypes.STRING())
        .field("account_id", DataTypes.DOUBLE())
        .field("event_type", DataTypes.STRING())
        .field("location_country", DataTypes.STRING())
        .field("event_timestamp", DataTypes.TIMESTAMP(precision=3))) \
        .in_append_mode() \
        .create_temporary_table("source")
Example no. 19
def pv_uv_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user_behavior")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    user_id: {"
                "      type: 'string'"
                "    },"
                "    item_id: {"
                "      type: 'string'"
                "    },"
                "    category_id: {"
                "      type: 'string'"
                "    },"
                "    behavior: {"
                "      type: 'string'"
                "    },"
                "    ts: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("user_id", DataTypes.STRING())
            .field("item_id", DataTypes.STRING())
            .field("category_id", DataTypes.STRING())
            .field("behavior", DataTypes.STRING())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("ts")
                .watermarks_periodic_bounded(60000))
         ) \
        .in_append_mode() \
        .register_table_source("source")

    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \
        .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \
        .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \
        .property("connector.table", "pv_uv_table") \
        .property("connector.write.flush.max-rows", "1")
    st_env.connect(custom_connector) \
        .with_schema(
        Schema()
            .field("startTime", DataTypes.TIMESTAMP())
            .field("endTime", DataTypes.TIMESTAMP())
            .field("pv", DataTypes.BIGINT())
            .field("uv", DataTypes.BIGINT())
    ).register_table_sink("sink")

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("w.start as startTime, w.end as endTime, COUNT(1) as pv, user_id.count.distinct as uv").insert_into("sink")

    st_env.execute("table pv uv")
Example no. 20
).in_streaming_mode().build()

st_env = StreamTableEnvironment.create(s_env,
                                       environment_settings=env_settings)
st_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-connector-kafka-base_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-jdbc_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/flink-sql-connector-kafka_2.11-1.11.0.jar;file:///root/anaconda3/lib/python3.6/site-packages/pyflink/lib/kafka-clients-2.1.0.jar"
)

# read from Kafka
properties = {
    "zookeeper.connect": "nn1.hadoop:2181,nn2.hadoop:2181,s1.hadoop:2181",
    "bootstrap.servers": "nn1.hadoop:9092,nn2.hadoop:9092,s1.hadoop:9092",
    "group.id": "testGroup"
}
st_env.connect(Kafka().properties(properties).version("universal").topic("test").start_from_latest()) \
    .with_format(Json()).with_schema(Schema() \
        .field('throughputReqMax', DataTypes.BIGINT()) \
        .field('throughputReqTotal', DataTypes.BIGINT())) \
    .create_temporary_table('mySource')

# write to CSV
st_env.connect(FileSystem().path('/usr/local/flink/test/result3.txt')) \
    .with_format(OldCsv()
                .field('sub', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('sub', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

# read fields a and b from the Kafka data, add them, multiply by 2, and insert into the sink
st_env.from_path('mySource')\
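    .select("(throughputReqMax + throughputReqTotal) * 2 as sub") \
    .insert_into('mySink')
# The two lines above are a guessed completion: the original query is truncated
# at this point. The source comment mentions fields "a" and "b", which do not
# appear in the declared schema, so this sketch uses the two BIGINT fields
# registered for mySource and writes the single 'sub' column expected by mySink.
st_env.execute("kafka_to_csv")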
Example no. 21
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment, StreamTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem, Kafka, Json, Csv

exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)

t_env.connect(Kafka()
              .version("0.11")
              .topic("test")
              .property("zookeeper.connect", "localhost:2181")
              .property("bootstrap.servers", "localhost:9092")
              ) \
    .in_append_mode() \
    .with_format(Csv()
                 .line_delimiter("\r\n")      \
                 .derive_schema()) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_source('mySource')

t_env.connect(FileSystem().path('../production_data/kafkaoutput')) \
    .with_format(OldCsv()
                 .field('tbd', DataTypes.INT())) \
    .with_schema(Schema()
                 .field("tbd", DataTypes.INT())) \
    .register_table_sink('mySink')
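
The snippet above registers mySource and mySink but stops before any query; a minimal sketch of how the pipeline could be finished (the selected column and job name are assumptions, not part of the original):

t_env.scan('mySource') \
    .select("tbd") \
    .insert_into('mySink')
t_env.execute("kafka_to_filesystem")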
Example no. 22
    if os.path.exists(result_file):
        os.remove(result_file)

    # udf
    @udf(input_types=[DataTypes.DECIMAL(38, 12, nullable=True)], result_type=DataTypes.DECIMAL(38, 12, nullable=True))
    def myadd(i):
        return i * i * 2
    st_env.register_function("add", myadd)

    # way kafka
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("universal")
            .topic("user")
            # .start_from_earliest()
            .start_from_specific_offset(0, 496)
            .property("zookeeper.connect", "6.86.2.170:2181")
            .property("bootstrap.servers", "6.86.2.170:9092")
    ) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .json_schema(
            "{"
            "  type: 'object',"
            "  properties: {"
            "    a: {"
            "      type: 'string'"
            "    },"
            "    b: {"
Example no. 23
from pyflink.table.window import Tumble

if __name__ == '__main__':

    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/tumble_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
    ) \
        .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .json_schema(
            "{"
            "  type: 'object',"
            "  properties: {"
            "    a: {"
            "      type: 'string'"
            "    },"
            "    b: {"
            "      type: 'string'"
Example no. 24
    for i, sentence in enumerate(sentences):
        for no, k in enumerate(sentence.split()[:maxlen][::-1]):
            x[i, -1 - no] = dic.get(k, UNK)
    indices = np.argmax(sess.run(Y, feed_dict = {X: x}), axis = 1)
    return label[indices[0]]


st_env.set_python_requirements('/notebooks/requirements.txt')

st_env.register_function('predict', predict)


st_env.connect(
    Kafka()
    .version('universal')
    .topic('test')
    .start_from_earliest()
    .property('zookeeper.connect', 'zookeeper:2181')
    .property('bootstrap.servers', 'kafka:9092')
).with_format(
    Json()
    .fail_on_missing_field(True)
    .schema(
        DataTypes.ROW(
            [
                DataTypes.FIELD('datetime', DataTypes.STRING()),
                DataTypes.FIELD('text', DataTypes.STRING()),
            ]
        )
    )
).with_schema(
    Schema()
Example no. 25
s_env = StreamExecutionEnvironment.get_execution_environment()
s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
s_env.set_parallelism(1)

# use blink table planner
st_env = StreamTableEnvironment\
    .create(s_env, environment_settings=EnvironmentSettings
            .new_instance()
            .in_streaming_mode()
            .use_blink_planner().build())

st_env \
    .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("Rides")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")) \
    .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .schema(DataTypes.ROW([
                DataTypes.FIELD("rideId", DataTypes.BIGINT()),
                DataTypes.FIELD("isStart", DataTypes.BOOLEAN()),
                DataTypes.FIELD("eventTime", DataTypes.TIMESTAMP()),
                DataTypes.FIELD("lon", DataTypes.FLOAT()),
                DataTypes.FIELD("lat", DataTypes.FLOAT()),
                DataTypes.FIELD("psgCnt", DataTypes.INT()),
                DataTypes.FIELD("taxiId", DataTypes.BIGINT())]))) \
    .with_schema(  # declare the schema of the table
        Schema()
Example no. 26
def distinct_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
             )
         ) \
        .with_schema(  # declare the schema of the table
             Schema()
             .field("rowtime", DataTypes.TIMESTAMP())
             .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
             .field("a", DataTypes.STRING())
             .field("b", DataTypes.STRING())
             .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("Orders")
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("distinct_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
        ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
           Json()
           .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")
    orders = st_env.scan("Orders")
    result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \
        .group_by("a, w").select("a, b.max.distinct as d")
    result.insert_into("result")
    st_env.execute("distinct agg streaming")
Example no. 27
from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka
from pyflink.table.window import Tumble

s_env = StreamExecutionEnvironment.get_execution_environment()
s_env.set_parallelism(1)
s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

# Stream Table
st_env = StreamTableEnvironment.create(s_env)

# Set source Kafka table
st_env \
    .connect(  # declare the external system to connect to
        Kafka()
            .version("0.11")
            .topic("input")
            .start_from_earliest()
            .property("zookeeper.connect", "zookeeper:2181")
            .property("bootstrap.servers", "kafka:9092")
    ) \
    .with_format(  # declare a format for this system
        Json()
            .fail_on_missing_field(True)
            .json_schema(
            "{"
            "  type: 'object',"
            "  properties: {"
            "    timestamp: {"
            "      type: 'string'"
            "    },"
            "    page: {"
            "      type: 'string'"
Example no. 28
def tumble_time_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/tumble_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
             )
         ) \
        .with_schema(  # declare the schema of the table
             Schema()
             .field("rowtime", DataTypes.TIMESTAMP())
             .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
             .field("a", DataTypes.STRING())
             .field("b", DataTypes.STRING())
             .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file))

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("tumble time window streaming")
Example no. 29
table_env = StreamTableEnvironment.create(env, table_config)

from pyflink.table.descriptors import Kafka, Json, OldCsv, Schema, FileSystem

directories = ['/flink/lib']
for directory in directories:
    for jar in glob.glob(os.path.join(directory, '*.jar')):
        sys.path.append(jar)

# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer11
# from org.apache.flink.streaming.connectors.kafka import FlinkKafkaConsumer09

OldCsv()
print("debug 010")

Kafka()
print("debug 020")
Json()
print("debug 030")


sourcetable = table_env \
    .connect(Kafka()
             .properties({'update-mode': 'append', 'connector.topic': 'machine.data',
                          'connector.properties.zookeeper.connect': 'localhost:2181',
                          'connector.properties.bootstrap.servers': 'localhost:9092'})) \
    .with_format(Json().
                 json_schema(
    "{type:'object',properties:{thing: {type: 'string'},quantity:{type:'string'},phenomenonTime:{type:'integer'},result:{type:'number'}}}") \
                .fail_on_missing_field(False)) \
    .with_schema(Schema()