Example #1
import unittest

from pyflink.table.descriptors import CustomConnectorDescriptor


# A minimal TestCase wrapper (assumed; the original snippet shows only the
# method) so the test is runnable as-is.
class CustomConnectorDescriptorTest(unittest.TestCase):

    def test_custom_connector(self):
        custom_connector = CustomConnectorDescriptor('kafka', 1, True) \
            .property('connector.topic', 'topic1') \
            .properties({'connector.version': '0.11', 'connector.startup-mode': 'earliest-offset'})

        properties = custom_connector.to_properties()
        expected = {'connector.type': 'kafka',
                    'connector.property-version': '1',
                    'connector.topic': 'topic1',
                    'connector.version': '0.11',
                    'connector.startup-mode': 'earliest-offset'}
        self.assertEqual(expected, properties)


# Imports used by the demo functions below (Example #6 shows the full import
# block these snippets rely on).
import os

from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
from pyflink.table import StreamTableEnvironment, DataTypes, CsvTableSink
from pyflink.table.descriptors import (Schema, CustomConnectorDescriptor,
                                       CustomFormatDescriptor)
from pyflink.table.window import Tumble


def custom_test_source_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/custom_test_source_demo.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False)
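    # 'pyflink-test' is a testing connector type, not a connector shipped with
    # Flink; a TableFactory matching these properties must be on the classpath
    # for the job to run.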
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
        ) \
        .register_table_source("source")

    st_env.register_table_sink(
        "result", CsvTableSink(["a"], [DataTypes.STRING()], result_file))
    orders = st_env.scan("source")
    orders.insert_into("result")
    st_env.execute("custom test source demo")
def custom_test_sink_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements([(1, "1a", "1laa"), (2, "2a", "2aa"),
                                 (3, None, "3aa"), (2, "4b", "4bb"),
                                 (5, "5a", "5aa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"),
                                  (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["d", "e", "f"]).select("d, e, f")

    result = left.left_outer_join(right, "a = d").select("a, b, e")
    # use custom retract sink connector
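    # (a left outer join may retract results it emitted earlier when a matching
    # right-side row arrives later, so the sink must handle retractions)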
    custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False)
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("a", DataTypes.BIGINT())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .register_table_sink("sink")
    result.insert_into("sink")
    st_env.execute("custom test sink demo")
def custom_kafka_source_demo():
    custom_connector = CustomConnectorDescriptor('kafka', 1, True) \
        .property('connector.topic', 'user') \
        .property('connector.properties.0.key', 'zookeeper.connect') \
        .property('connector.properties.0.value', 'localhost:2181') \
        .property('connector.properties.1.key', 'bootstrap.servers') \
        .property('connector.properties.1.value', 'localhost:9092') \
        .properties({'connector.version': '0.11', 'connector.startup-mode': 'earliest-offset'})
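    # 'connector.properties.N.key'/'connector.properties.N.value' is the indexed
    # encoding the legacy Kafka descriptor uses for arbitrary client properties;
    # Kafka().property(...) in Example #5 produces the same keys.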

    # the key is 'format.json-schema'
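    # (CustomFormatDescriptor passes keys through verbatim, so the full
    # 'format.'-prefixed key is required; compare Json().json_schema(...) in
    # Example #5, which sets this key for you)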
    custom_format = CustomFormatDescriptor('json', 1) \
        .property('format.json-schema',
                  "{"
                  "  type: 'object',"
                  "  properties: {"
                  "    a: {"
                  "      type: 'string'"
                  "    },"
                  "    b: {"
                  "      type: 'string'"
                  "    },"
                  "    c: {"
                  "      type: 'string'"
                  "    },"
                  "    time: {"
                  "      type: 'string',"
                  "      format: 'date-time'"
                  "    }"
                  "  }"
                  "}") \
        .properties({'format.fail-on-missing-field': 'true'})

    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/custom_kafka_source_demo.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(custom_connector) \
        .with_format(
            custom_format
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("proctime", DataTypes.TIMESTAMP())
            .proctime()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.STRING()], result_file))

    st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("custom kafka source demo")
Example #5
# In addition to the imports above, this example uses:
from pyflink.table import EnvironmentSettings
from pyflink.table.descriptors import Kafka, Json, Rowtime


def pv_uv_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user_behavior")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    user_id: {"
                "      type: 'string'"
                "    },"
                "    item_id: {"
                "      type: 'string'"
                "    },"
                "    category_id: {"
                "      type: 'string'"
                "    },"
                "    behavior: {"
                "      type: 'string'"
                "    },"
                "    ts: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
            )
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("user_id", DataTypes.STRING())
            .field("item_id", DataTypes.STRING())
            .field("category_id", DataTypes.STRING())
            .field("behavior", DataTypes.STRING())
            .field("rowtime", DataTypes.TIMESTAMP())
            .rowtime(
                Rowtime()
                .timestamps_from_field("ts")
                .watermarks_periodic_bounded(60000))
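                # timestamps_from_field("ts") designates the JSON 'ts' field as
                # the event-time attribute; watermarks_periodic_bounded(60000)
                # allows records to arrive up to 60s (60000 ms) out of order.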
         ) \
        .in_append_mode() \
        .register_table_source("source")

    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \
        .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \
        .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \
        .property("connector.table", "pv_uv_table") \
        .property("connector.write.flush.max-rows", "1")
    st_env.connect(custom_connector) \
        .with_schema(
            Schema()
            .field("startTime", DataTypes.TIMESTAMP())
            .field("endTime", DataTypes.TIMESTAMP())
            .field("pv", DataTypes.BIGINT())
            .field("uv", DataTypes.BIGINT())
        ) \
        .register_table_sink("sink")

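    # Hourly tumbling event-time window: pv counts all rows per window, uv
    # counts distinct user_id values.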
    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w") \
        .select("w.start as startTime, w.end as endTime, COUNT(1) as pv, user_id.count.distinct as uv").insert_into("sink")

    st_env.execute("table pv uv")
Example #6
import os

from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic
from pyflink.table import StreamTableEnvironment, DataTypes, CsvTableSink
from pyflink.table.descriptors import Schema, CustomFormatDescriptor, CustomConnectorDescriptor, Json
from pyflink.table.window import Tumble

if __name__ == '__main__':
    custom_connector = CustomConnectorDescriptor('kafka', 1, True) \
        .property('connector.topic', 'user') \
        .property('connector.properties.0.key', 'zookeeper.connect') \
        .property('connector.properties.0.value', 'localhost:2181') \
        .property('connector.properties.1.key', 'bootstrap.servers') \
        .property('connector.properties.1.value', 'localhost:9092') \
        .properties({'connector.version': '0.11', 'connector.startup-mode': 'earliest-offset'})

    # the key is 'format.json-schema'
    custom_format = CustomFormatDescriptor('json', 1) \
        .property('format.json-schema',
                  "{"
                  "  type: 'object',"
                  "  properties: {"
                  "    a: {"
                  "      type: 'string'"
                  "    },"
                  "    b: {"
                  "      type: 'string'"
                  "    },"
                  "    c: {"
                  "      type: 'string'"
                  "    },"