Example #1
def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[(Instant.of_epoch_milli(1000), 'Alice', 110.1),
                    (Instant.of_epoch_milli(4000), 'Bob', 30.2),
                    (Instant.of_epoch_milli(3000), 'Alice', 20.0),
                    (Instant.of_epoch_milli(2000), 'Bob', 53.1),
                    (Instant.of_epoch_milli(5000), 'Alice', 13.1),
                    (Instant.of_epoch_milli(3000), 'Bob', 3.1),
                    (Instant.of_epoch_milli(7000), 'Bob', 16.1),
                    (Instant.of_epoch_milli(10000), 'Alice', 20.1)],
        type_info=Types.ROW([Types.INSTANT(),
                             Types.STRING(),
                             Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder().column_by_expression(
            "ts", "CAST(f0 AS TIMESTAMP_LTZ(3))").column(
                "f1",
                DataTypes.STRING()).column("f2", DataTypes.FLOAT()).watermark(
                    "ts", "ts - INTERVAL '3' SECOND").build()).alias(
                        "ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print').schema(
            Schema.new_builder().column('name', DataTypes.STRING()).column(
                'total_price', DataTypes.FLOAT()).column(
                    'w_start', DataTypes.TIMESTAMP_LTZ()).column(
                        'w_end', DataTypes.TIMESTAMP_LTZ()).build()).build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
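# The snippet above omits its imports. With Flink 1.14+ the names it uses come
# from roughly the following modules (a sketch; exact module paths can vary
# between releases):
from pyflink.common import Types
from pyflink.common.time import Instant
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (DataTypes, Schema, StreamTableEnvironment,
                           TableDescriptor)
from pyflink.table.expressions import col, lit
from pyflink.table.udf import udaf
from pyflink.table.window import Tumble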
Example #2
    def test_equals_and_hash(self):

        config1 = StreamExecutionEnvironment.get_execution_environment(
        ).get_config()

        config2 = StreamExecutionEnvironment.get_execution_environment(
        ).get_config()

        self.assertEqual(config1, config2)

        self.assertEqual(hash(config1), hash(config2))

        config1.set_parallelism(12)
        config2.set_parallelism(11)

        self.assertNotEqual(config1, config2)

        self.assertNotEqual(hash(config1), hash(config2))

        config2.set_parallelism(12)

        self.assertEqual(config1, config2)

        self.assertEqual(hash(config1), hash(config2))
Example #3
    def setUp(self) -> None:
        self.env = StreamExecutionEnvironment.get_execution_environment()
        self.env.set_parallelism(1)
        self.env.add_jars("file://{}".format(find_jar_path()))
        self.t_env = StreamTableEnvironment.create(self.env)
        self.source_table = self.t_env.from_descriptor(
            TableDescriptor.for_connector("datagen")
            .schema(Schema.new_builder()
                    .column("x", DataTypes.INT())
                    .column("a", DataTypes.INT())
                    .build())
            .option("fields.x.kind", "sequence")
            .option("fields.x.start", "1")
            .option("fields.x.end", "100")
            .option("fields.a.kind", "sequence")
            .option("fields.a.start", "101")
            .option("fields.a.end", "200")
            .build())
Example #4
    def test_create_table_environment(self):
        table_config = TableConfig()
        table_config.set_max_generated_code_length(32000)
        table_config.set_null_check(False)
        table_config.set_timezone("Asia/Shanghai")

        env = StreamExecutionEnvironment.get_execution_environment()
        t_env = StreamTableEnvironment.create(env, table_config)

        read_table_config = t_env.get_config()

        self.assertFalse(read_table_config.get_null_check())
        self.assertEqual(read_table_config.get_max_generated_code_length(),
                         32000)
        self.assertEqual(read_table_config.get_timezone(), "Asia/Shanghai")
Example #5
def kafka_to_mysql():
    """
  从Kafka Source读取Json数据,然后导入到Mysql。{"msg": "welcome flink users..."}
  cp
  """
    settings = EnvironmentSettings.new_instance().in_streaming_mode(
    ).use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
                    CREATE TABLE kafka_source (
                        msg STRING
                    ) WITH (
                        'connector' = 'kafka-0.11',
                        'topic' = 'cdn-log',
                        'properties.bootstrap.servers' = 'kafka:9092',
                        'format' = 'json',
                        'scan.startup.mode' = 'latest-offset'
                    )
                    """

    sink_ddl = """
                  CREATE TABLE mysql_sink (
                    msg STRING 
                ) WITH (
                   'connector' = 'jdbc',
                   'url' = 'jdbc:mysql://mysql:3306/flinkdb?characterEncoding=utf-8&useSSL=false',
                   'table-name' = 'cdn_log',
                   'username' = 'root',
                   'password' = '123456',
                   'sink.buffer-flush.max-rows' = '1'
                )
        """

    # register the source and sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # extract the data
    tab = t_env.from_path("kafka_source")
    # for now we still use the API that is marked as deprecated, because the new asynchronous submission needs further testing...
    tab.insert_into("mysql_sink")
    # execute the job
    t_env.execute("kafka_to_mysql")
Example #6
def test_stream():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)

    # t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 1000000)
    # t_env.get_config().get_configuration().set_boolean("table.exec.mini-batch.enabled", True)
    # t_env.get_config().get_configuration().set_integer("table.exec.mini-batch.allow-latency", 1000)
    # t_env.get_config().get_configuration().set_integer("table.exec.mini-batch.size", 100000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)

    t_env.create_temporary_function("python_avg", MeanAggregateFunction())
    t_env.create_java_temporary_system_function("java_avg", "com.alibaba.flink.function.JavaAvg")

    num_rows = 10000000

    t_env.execute_sql(f"""
        CREATE TABLE source (
            id INT,
            num INT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) WITH (
          'connector' = 'Range',
          'start' = '1',
          'end' = '{num_rows}',
          'step' = '1',
          'partition' = '200'
        )
    """)
    t_env.register_table_sink(
        "sink",
        PrintTableSink(
            ["num", "value"],
            [DataTypes.INT(False), DataTypes.FLOAT(False)], 1000000))
    result = t_env.from_path("source") \
        .select("num % 1000 as num, id") \
        .group_by("num") \
        .select("num, python_avg(id)")
    result.insert_into("sink")
    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink stream group agg consume time: " + str(time.time() - beg_time))
Example #7
def tumble_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[(Instant.of_epoch_milli(1000), 'Alice', 110.1),
                    (Instant.of_epoch_milli(4000), 'Bob', 30.2),
                    (Instant.of_epoch_milli(3000), 'Alice', 20.0),
                    (Instant.of_epoch_milli(2000), 'Bob', 53.1),
                    (Instant.of_epoch_milli(5000), 'Alice', 13.1),
                    (Instant.of_epoch_milli(3000), 'Bob', 3.1),
                    (Instant.of_epoch_milli(7000), 'Bob', 16.1),
                    (Instant.of_epoch_milli(10000), 'Alice', 20.1)],
        type_info=Types.ROW([Types.INSTANT(),
                             Types.STRING(),
                             Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder().column_by_expression(
            "ts", "CAST(f0 AS TIMESTAMP(3))").column(
                "f1",
                DataTypes.STRING()).column("f2", DataTypes.FLOAT()).watermark(
                    "ts", "ts - INTERVAL '3' SECOND").build()).alias(
                        "ts, name, price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print').schema(
            Schema.new_builder().column('name', DataTypes.STRING()).column(
                'total_price', DataTypes.FLOAT()).build()).build())

    # define the over window operation
    table = table.over_window(
        Over.partition_by("name")
            .order_by("ts")
            .preceding(row_interval(2))
            .following(CURRENT_ROW)
            .alias('w')) \
        .select(table.name, table.price.max.over(col('w')))

    # submit for execution
    table.execute_insert('sink') \
         .wait()
Example #8
    def __init__(self):
        # self.feature_extractor = DemoFeatureExtractor()
        self.settings = EnvironmentSettings.new_instance().in_streaming_mode(
        ).use_blink_planner().build()
        self.env = StreamExecutionEnvironment.get_execution_environment()
        self.env.set_parallelism(1)
        self.table_env = StreamTableEnvironment.create(
            self.env, environment_settings=self.settings)
        self.table_env.get_config().get_configuration().set_boolean(
            "python.fn-execution.memory.managed", True)
        self.table_env.add_python_file('feature_extractors')

        source_table = open('feature_extractors/source.sql', 'r').read()
        sink_table = open('feature_extractors/sink.sql', 'r').read()

        self.table_env.execute_sql(source_table)
        self.table_env.execute_sql(sink_table)
Example #9
def run():
    # get the execution environment
    env = StreamExecutionEnvironment.get_execution_environment()

    # set up the environment
    env_setting(env)
    # set the parallelism
    env.set_parallelism(1)
    # add the jar files; on Windows, change these to the paths of your own jars
    kafka_jar = f"file://{os.getcwd()}/jars/flink-connector-kafka_2.11-1.12.0.jar"
    kafka_client = f"file://{os.getcwd()}/jars/kafka-clients-2.4.1.jar"
    env.add_jars(kafka_jar, kafka_client)

    # add the Python files
    env.add_python_file(f"{os.getcwd()}/config_file.py")
    env.add_python_file(f"{os.getcwd()}/env_setting.py")

    # use a packaged Python environment (a custom environment archive)
    env.add_python_archive(f"{os.getcwd()}/venv.zip")
    env.set_python_executable("venv.zip/venv/bin/python")
    # or use the local Python environment
    # env.set_python_executable(PYTHON_EXECUTABLE)
    env.disable_operator_chaining()

    kafka_product_properties = get_kafka_Producer_properties(TEST_KAFKA_SERVERS)

    properties = get_kafka_customer_properties(TEST_KAFKA_SERVERS, TEST_GROUP_ID)

    data_stream = env.add_source(
        FlinkKafkaConsumer(topics=TEST_KAFKA_TOPIC,
                           properties=properties,
                           deserialization_schema=SimpleStringSchema()) \
            .set_commit_offsets_on_checkpoints(True)
    ) \
        .name(f"消费{TEST_KAFKA_TOPIC}主题数据")

    data_stream.map(lambda value: json.loads(value)) \
        .name("parse to json") \
        .map(lambda value: json.dumps(value), BasicTypeInfo.STRING_TYPE_INFO()) \
        .name("convert to str") \
        .add_sink(FlinkKafkaProducer(topic=TEST_SINK_TOPIC,
                                     producer_config=kafka_product_properties,
                                     serialization_schema=SimpleStringSchema())) \
        .name("write to kafka")

    env.execute("test pyflink reading from and writing to kafka")
Example #10
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to one so that all data, including fired timers and normal records,
    # is processed by the same worker and the collected results stay in order, which makes
    # the assertions easier.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    create_kafka_source_ddl = """
                CREATE TABLE payment_msg(
                    createTime VARCHAR,
                    rt as TO_TIMESTAMP(createTime),
                    orderId BIGINT,
                    payAmount DOUBLE,
                    payPlatform INT,
                    provinceId INT,
                    WATERMARK FOR rt as rt - INTERVAL '2' SECOND
                ) WITH (
                  'connector.type' = 'kafka',
                  'connector.version' = 'universal',
                  'connector.topic' = 'timer-stream-source',
                  'connector.properties.bootstrap.servers' = 'localhost:9092',
                  'connector.properties.group.id' = 'test_3',
                  'connector.startup-mode' = 'earliest-offset',
                  'format.type' = 'json'
                )
                """
    t_env.execute_sql(create_kafka_source_ddl)
    t = t_env.from_path("payment_msg").select("createTime, orderId, payAmount, payPlatform,"
                                              " provinceId")
    source_type_info = Types.ROW([
        Types.STRING(),
        Types.LONG(),
        Types.DOUBLE(),
        Types.INT(),
        Types.INT()])
    ds = t_env.to_append_stream(table=t, type_info=source_type_info)
    producer_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'pyflink-e2e-source'}
    kafka_producer = FlinkKafkaProducer("timer-stream-sink", SimpleStringSchema(),
                                        producer_props)
    ds.key_by(MyKeySelector(), key_type_info=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
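# MyKeySelector and MyProcessFunction are defined elsewhere in the test module.
# A minimal sketch of what they could look like (assumed implementations: key
# by orderId and echo each record from a keyed process function):
from pyflink.datastream.functions import KeySelector, KeyedProcessFunction


class MyKeySelector(KeySelector):

    def get_key(self, value):
        # key by the second field of the row (orderId)
        return value[1]


class MyProcessFunction(KeyedProcessFunction):

    def process_element(self, value, ctx):
        # register an event-time timer a little after the current watermark
        ctx.timer_service().register_event_time_timer(
            ctx.timer_service().current_watermark() + 1500)
        yield "processed: " + str(value)

    def on_timer(self, timestamp, ctx):
        yield "timer fired for key {} at {}".format(
            ctx.get_current_key(), timestamp)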
Example #11
def basic_operations():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    # define the source
    ds = env.from_collection(collection=[
        (1,
         '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'
         ),
        (2,
         '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'
         ),
        (3,
         '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'
         ),
        (4,
         '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}'
         )
    ],
                             type_info=Types.ROW_NAMED(
                                 ["id", "info"],
                                 [Types.INT(), Types.STRING()]))

    # map
    def update_tel(data):
        # parse the json
        json_data = json.loads(data.info)
        json_data['tel'] += 1
        return data.id, json.dumps(json_data)

    show(ds.map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')
    # (2, '{"name": "hello", "tel": 136, "addr": {"country": "China", "city": "Shanghai"}}')
    # (3, '{"name": "world", "tel": 125, "addr": {"country": "USA", "city": "NewYork"}}')
    # (4, '{"name": "PyFlink", "tel": 33, "addr": {"country": "China", "city": "Hangzhou"}}')

    # filter
    show(ds.filter(lambda data: data.id == 1).map(update_tel), env)
    # (1, '{"name": "Flink", "tel": 124, "addr": {"country": "Germany", "city": "Berlin"}}')

    # key by
    show(
        ds.map(lambda data: (json.loads(data.info)['addr']['country'],
                             json.loads(data.info)['tel'])).key_by(
                                 lambda data: data[0]).sum(1), env)
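# The show() helper used above is not a PyFlink API; it comes from the
# surrounding example module. A sketch consistent with how it is used here
# (print the stream and trigger execution):
def show(ds, env):
    ds.print()
    env.execute()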
Example #12
    def create_table_env(self):
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        stream_env.set_stream_time_characteristic(
            TimeCharacteristic.ProcessingTime)
        stream_env.set_parallelism(1)

        t_env = StreamTableEnvironment.create(
            stream_env,
            environment_settings=EnvironmentSettings.new_instance(
            ).in_streaming_mode().use_blink_planner().build())
        statement_set = t_env.create_statement_set()
        t_env.get_config().set_python_executable(execute_path)
        t_env.get_config().get_configuration().set_boolean(
            "python.fn-execution.memory.managed", True)
        t_env.get_config().get_configuration().set_string(
            "taskmanager.memory.task.off-heap.size", '512m')
        t_env.get_config().get_configuration().set_string("rest.port", '8081')
        return stream_env, t_env, statement_set
Example #13
File: env.py Project: alibaba/Alink
def usePyFlinkEnv(parallelism: int = None, flinkHome: str = None) -> MLEnv:
    global _mlenv
    if in_custom_env():
        print("Warning: usePyFlinkEnv will do nothing, since useCustomEnv is used to initialize MLEnv.")
        return _mlenv

    resetEnv()
    if flinkHome is not None:
        g_config["flink_home"] = flinkHome

    # Let PyFlink launch the gateway, and warn users to add jars to the pyflink lib path
    print("Warning: You're running the script with 'getMLEnv'. "
          "You have to manually add Alink jars to PyFlink lib path to make the script work.")
    import pyflink
    # noinspection PyUnresolvedReferences
    gateway = pyflink.java_gateway.get_gateway()
    # noinspection PyUnresolvedReferences
    pyflink.java_gateway.import_flink_view(gateway)

    # In PyFlink 1.9 and 1.10, PyFlink doesn't start a callback server,
    # so we start the callback server manually.
    success = gateway.start_callback_server(
        callback_server_parameters=CallbackServerParameters(port=0, daemonize=True, daemonize_connections=True))
    if success:
        callback_server_port = gateway.get_callback_server().get_listening_port()
        gateway.java_gateway_server.resetCallbackClient(
            gateway.java_gateway_server.getCallbackClient().getAddress(),
            callback_server_port)

    set_java_gateway(gateway)

    from pyflink.dataset import ExecutionEnvironment
    from pyflink.datastream import StreamExecutionEnvironment

    benv = ExecutionEnvironment.get_execution_environment()
    senv = StreamExecutionEnvironment.get_execution_environment()
    if parallelism is not None:
        benv.set_parallelism(parallelism)
        senv.set_parallelism(parallelism)

    # noinspection PyProtectedMember
    _mlenv = setup_py_mlenv(gateway, benv._j_execution_environment, senv._j_stream_execution_environment)
    return _mlenv
Example #14
def left_outer_join_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements(
        [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")],
        ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["d", "e", "f"]).select("d, e, f")

    result = left.left_outer_join(right, "a = d").select("a, b, e")
    # use custom retract sink connector
    sink = TestRetractSink(["a", "b", "c"],
                           [DataTypes.BIGINT(),
                            DataTypes.STRING(),
                            DataTypes.STRING()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("left outer join streaming")
Example #15
def tutorial():
    env = StreamExecutionEnvironment.get_execution_environment()
    jar_files = (
        'flink-connector-kafka_2.12-1.12.2.jar',
        'kafka-clients-2.4.1.jar',
    )
    jar_paths = tuple('file://' +
                      os.path.abspath(os.path.join(cur_path, jar_file))
                      for jar_file in jar_files)

    env.add_jars(*jar_paths)
    env.add_classpaths(*jar_paths)
    env.set_parallelism(1)

    ds = env.add_source(
        FlinkKafkaConsumer(TOPIC, SimpleStringSchema(), KAFKA_PROPERTIES))

    ds.print()
    env.execute("tutorial_job")
Example #16
def custom_test_source_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/custom_test_source_demo.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False)
    st_env.connect(custom_connector) \
        .with_schema(
        Schema()
            .field("a", DataTypes.STRING())
    ).register_table_source("source")

    st_env.register_table_sink(
        "result", CsvTableSink(["a"], [DataTypes.STRING()], result_file))
    orders = st_env.scan("source")
    orders.insert_into("result")
    st_env.execute("custom test source demo")
Example #17
def word_count(input_path, output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)
    # write all the data to one file
    env.set_parallelism(1)

    # define the source
    if input_path is not None:
        ds = env.from_source(
            source=FileSource.for_record_stream_format(
                StreamFormat.text_line_format(),
                input_path).process_static_file_set().build(),
            watermark_strategy=WatermarkStrategy.for_monotonous_timestamps(),
            source_name="file_source")
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        ds = env.from_collection(word_count_data)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
           .map(lambda i: (i, 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
           .key_by(lambda i: i[0]) \
           .reduce(lambda i, j: (i[0], i[1] + j[1]))

    # define the sink
    if output_path is not None:
        ds.sink_to(sink=FileSink.for_row_format(
            base_path=output_path, encoder=Encoder.simple_string_encoder()
        ).with_output_file_config(OutputFileConfig.builder().with_part_prefix(
            "prefix").with_part_suffix(".ext").build()).with_rolling_policy(
                RollingPolicy.default_rolling_policy()).build())
    else:
        print(
            "Printing result to stdout. Use --output to specify output path.")
        ds.print()

    # submit for execution
    env.execute()
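# A possible command-line entry point for the word_count function above
# (a sketch; word_count_data would be a module-level list of sample sentences):
if __name__ == '__main__':
    import argparse
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=False,
                        help='Output file to write results to.')
    known_args, _ = parser.parse_known_args(sys.argv[1:])

    word_count(known_args.input, known_args.output)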
Example #18
def main(args):
    func = args[1]
    version = args[2]
    index_name = '_'.join(["performance_pyflink", func, version])
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)

    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 300000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    # t_env.register_table_sink(
    #     "sink",
    #     PrintTableSink(
    #         ["id"],
    #         [DataTypes.INT(False)]))

    @udf(input_types=[DataTypes.INT(False)], result_type=DataTypes.INT(False))
    def inc(x):
        return x + 1

    t_env.register_function("inc", inc)
    t_env.register_java_function("java_inc", "com.alibaba.flink.function.JavaInc")
    register_source(t_env)
    register_sink(t_env, index_name)

    source = t_env.from_path("source")

    if func == 'java':
        table = source.select("java_inc(a) as a")
    else:
        table = source.select("inc(a) as a")

    table.filter("a % 1000000 = 0") \
        .insert_into("sink")

    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink Python UDF inc() consume time: " + str(time.time() - beg_time))
Example #19
def ride_duration():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)

    # use blink table planner
    st_env = StreamTableEnvironment \
        .create(s_env, environment_settings=EnvironmentSettings
                .new_instance()
                .in_streaming_mode()
                .use_blink_planner().build())

    # register source and sink
    register_rides_source(st_env)
    register_ride_duration_sink(st_env)

    # register java udf (isInNYC, timeDiff)
    # note: the class behind timeDiff is com.ververica.sql_training.udfs.TimeDiff
    st_env.register_java_function("isInNYC", "com.ververica.sql_training.udfs.IsInNYC")
    # register the timeDiff function
    st_env.register_java_function("timeDiff", "com.ververica.sql_training.udfs.TimeDiff")
Example #20
def mixing_use_of_datastream_and_table():
    # use StreamTableEnvironment instead of TableEnvironment when mixing use of table & datastream
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source
    t_env.create_temporary_table(
        'source',
        TableDescriptor.for_connector('datagen').schema(
            Schema.new_builder().column('id', DataTypes.BIGINT()).column(
                'data',
                DataTypes.STRING()).build()).option("number-of-rows",
                                                    "10").build())

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print').schema(
            Schema.new_builder().column('a',
                                        DataTypes.BIGINT()).build()).build())

    @udf(result_type=DataTypes.BIGINT())
    def length(data):
        return len(data)

    # perform table api operations
    table = t_env.from_path("source")
    table = table.select(col('id'), length(col('data')))

    # convert table to datastream and perform datastream api operations
    ds = t_env.to_data_stream(table)
    ds = ds.map(lambda i: i[0] + i[1], output_type=Types.LONG())

    # convert datastream to table and perform table api operations as you want
    table = t_env.from_data_stream(
        ds,
        Schema.new_builder().column("f0", DataTypes.BIGINT()).build())

    # execute
    table.execute_insert('sink') \
         .wait()
Example #21
def distinct_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))

    orders = st_env.scan("Orders")
    result = orders.select("a, b").distinct()
    # use custom retract sink connector
    sink = TestRetractSink(["a", "b"], [DataTypes.STRING(), DataTypes.INT()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("distinct streaming")
Example #22
def from_kafka_to_kafka_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    s_env.set_parallelism(1)

    # use blink table planner
    st_env = StreamTableEnvironment \
        .create(s_env, environment_settings=EnvironmentSettings
                .new_instance()
                .in_streaming_mode()
                .use_blink_planner().build())

    # register source and sink
    register_rides_source(st_env)
    register_rides_sink(st_env)

    # query
    st_env.from_path("source").select("*").insert_into("sink")

    # execute
    st_env.execute("from_kafka_to_kafka")
Example #23
def popular_destination_query():
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    t_env.execute_sql(
        create_table_ddl(
            "WATERMARK FOR pickupTime AS pickupTime - INTERVAL '30' SECONDS"))

    query = f"""SELECT 
    destLocationId, wstart, wend, cnt 
FROM 
    (SELECT 
        destLocationId, 
        HOP_START(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wstart, 
        HOP_END(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE) AS wend, 
        COUNT(destLocationId) AS cnt 
    FROM
        (SELECT 
            pickupTime, 
            destLocationId 
        FROM TaxiRide) 
    GROUP BY
        destLocationId, HOP(pickupTime, INTERVAL '5' MINUTE, INTERVAL '15' MINUTE)
    )
WHERE cnt > {args.threshold}
"""

    results = t_env.sql_query(query)

    t_env.to_append_stream(
        results,
        Types.ROW_NAMED(['destLocationId', 'wstart', 'wend', 'cnt'], [
            Types.INT(),
            Types.SQL_TIMESTAMP(),
            Types.SQL_TIMESTAMP(),
            Types.LONG()
        ])).print()

    env.execute('Popular-Destination')
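# create_table_ddl and args come from the surrounding script and are not shown.
# A rough sketch of the kind of DDL the helper might return, with the watermark
# clause passed in (the column names follow the query above; the connector
# options are assumptions):
def create_table_ddl(watermark_clause):
    return f"""
        CREATE TABLE TaxiRide (
            pickupTime TIMESTAMP(3),
            destLocationId INT,
            {watermark_clause}
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'taxirides',
            'properties.bootstrap.servers' = 'localhost:9092',
            'format' = 'json'
        )
    """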
Example #24
def data_transfer():

    input_file = sys.argv[1]
    test_size = sys.argv[2]
    run_num = sys.argv[3]
    file_string = 'file:///home/tito/workspace/inputs/' + str(input_file)
    perf_file = './perf/' + str(test_size) + '/perf_' + str(run_num) + '.csv'
    start = time.time()

    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    ds = env.read_text_file(file_string, 'UTF-8')
    ds.add_sink(StreamingFileSink
                .for_row_format('/home/tito/workspace/outputs', SimpleStringEncoder())
                .build())

    env.execute('data_transfer_job')

    end = time.time()
    with open(perf_file, 'w+') as f:
        f.write(f'{start},{end}\n')
Example #25
def log_processing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string("pipeline.jars",
                                                      "file://" + FAT_JAR_PATH)

    source_ddl = """
            CREATE TABLE source_table(
                a VARCHAR,
                b INT
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'source_topic',
              'properties.bootstrap.servers' = 'localhost:9092',
              'properties.group.id' = 'test_group',
              'scan.startup.mode' = 'earliest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                a VARCHAR
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'sink_topic',
              'properties.bootstrap.servers' = 'localhost:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    t_env.from_path("source_table").select("a").execute_insert(
        "sink_table").wait()
Example #26
def in_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements(
        [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")],
                                 ["a", "b", "c"]).select("a")

    result = left.where("a.in(%s)" % right).select("b, c")
    # another way
    # st_env.register_table("RightTable", right)
    # result = left.where("a.in(RightTable)")

    # use custom retract sink connector
    sink = TestRetractSink(["a", "b"],
                           [DataTypes.STRING(),
                            DataTypes.STRING()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("in streaming")
Example #27
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()

    source_type_info = Types.ROW([Types.STRING(), Types.INT()])
    json_row_deserialization_schema = JsonRowDeserializationSchema.builder()\
        .type_info(source_type_info).build()
    source_topic = 'test-python-data-stream-source'
    consumer_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-source'
    }
    kafka_consumer_1 = FlinkKafkaConsumer(source_topic,
                                          json_row_deserialization_schema,
                                          consumer_props)
    kafka_consumer_1.set_start_from_earliest()
    source_stream_1 = env.add_source(kafka_consumer_1).name('kafka source 1')
    mapped_type_info = Types.ROW([Types.STRING(), Types.INT(), Types.INT()])

    keyed_stream = source_stream_1.map(add_one, output_type=mapped_type_info) \
        .key_by(lambda x: x[2])

    flat_mapped_stream = keyed_stream.flat_map(m_flat_map,
                                               result_type=mapped_type_info)
    flat_mapped_stream.name("flat-map").set_parallelism(3)

    sink_topic = 'test-python-data-stream-sink'
    producer_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-1'
    }
    json_row_serialization_schema = JsonRowSerializationSchema.builder()\
        .with_type_info(mapped_type_info).build()
    kafka_producer = FlinkKafkaProducer(
        topic=sink_topic,
        producer_config=producer_props,
        serialization_schema=json_row_serialization_schema)
    flat_mapped_stream.add_sink(kafka_producer)
    env.execute_async("test data stream to kafka")
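# add_one and m_flat_map are helpers defined elsewhere in the e2e test module.
# A sketch of plausible implementations that match the declared type infos
# (Row(STRING, INT) in, Row(STRING, INT, INT) out); these are assumptions, not
# the original code:
from pyflink.common import Row


def add_one(value):
    # append a derived integer field used as the key downstream
    return Row(value[0], value[1], value[1] + 1)


def m_flat_map(value):
    # emit one record per unit of the last field
    for i in range(value[2]):
        yield Row(value[0], value[1], i)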
Example #28
def python_data_stream_example():
    env = StreamExecutionEnvironment.get_execution_environment()
    # Set the parallelism to one so that all data, including fired timers and normal records,
    # is processed by the same worker and the collected results stay in order, which makes
    # the assertions easier.
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    type_info = Types.ROW_NAMED(
        ['createTime', 'orderId', 'payAmount', 'payPlatform', 'provinceId'],
        [Types.LONG(),
         Types.LONG(),
         Types.DOUBLE(),
         Types.INT(),
         Types.INT()])
    json_row_schema = JsonRowDeserializationSchema.builder().type_info(
        type_info).build()
    kafka_props = {
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'pyflink-e2e-source'
    }

    kafka_consumer = FlinkKafkaConsumer("timer-stream-source", json_row_schema,
                                        kafka_props)
    kafka_producer = FlinkKafkaProducer("timer-stream-sink",
                                        SimpleStringSchema(), kafka_props)

    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(Duration.of_seconds(5))\
        .with_timestamp_assigner(KafkaRowTimestampAssigner())

    kafka_consumer.set_start_from_earliest()
    ds = env.add_source(kafka_consumer).assign_timestamps_and_watermarks(
        watermark_strategy)
    ds.key_by(MyKeySelector(), key_type=Types.LONG()) \
        .process(MyProcessFunction(), output_type=Types.STRING()) \
        .add_sink(kafka_producer)
    env.execute_async("test data stream timer")
Example #29
def custom_test_sink_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements([(1, "1a", "1laa"), (2, "2a", "2aa"),
                                 (3, None, "3aa"), (2, "4b", "4bb"),
                                 (5, "5a", "5aa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"),
                                  (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["d", "e", "f"]).select("d, e, f")

    result = left.left_outer_join(right, "a = d").select("a, b, e")
    # use custom retract sink connector
    custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False)
    st_env.connect(custom_connector) \
        .with_schema(
        Schema()
            .field("a", DataTypes.BIGINT())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
    ).register_table_sink("sink")
    result.insert_into("sink")
    st_env.execute("custom test sink demo")
Example #30
def run_consumer(output_path):
    env = StreamExecutionEnvironment.get_execution_environment()
    # write all the data to one file
    env.set_parallelism(1)

    # get the credit card data
    dataset = datasets.CreditCard()

    # create a small collection of items
    i = 0
    num_of_items = 2000
    items = []
    for x, y in dataset:
        if i == num_of_items:
            break
        i += 1
        items.append((json.dumps(x), y))

    credit_stream = env.from_collection(collection=items,
                                        type_info=Types.ROW(
                                            [Types.STRING(),
                                             Types.STRING()]))

    # detect fraud in transactions
    fraud_data = credit_stream.map(lambda data: \
        json.dumps(requests.post('http://localhost:9000/predict', \
                        json={'x': data[0], 'y': data[1]}).json()), \
                        output_type=Types.STRING())

    # save the results to a file
    fraud_data.sink_to(sink=FileSink.for_row_format(
        base_path=output_path,
        encoder=Encoder.simple_string_encoder()).build())

    # submit for execution
    env.execute()