Example #1
    def test_create_table_environment_with_blink_planner(self):
        t_env = StreamTableEnvironment.create(
            self.env,
            environment_settings=EnvironmentSettings.new_instance(
            ).use_blink_planner().build())

        planner = t_env._j_tenv.getPlanner()

        self.assertEqual(
            planner.getClass().getName(),
            "org.apache.flink.table.planner.delegation.StreamPlanner")

        t_env = StreamTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance().build())

        planner = t_env._j_tenv.getPlanner()

        self.assertEqual(
            planner.getClass().getName(),
            "org.apache.flink.table.planner.delegation.StreamPlanner")

        t_env = StreamTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance(
            ).use_old_planner().build())

        planner = t_env._j_tenv.getPlanner()

        self.assertEqual(planner.getClass().getName(),
                         "org.apache.flink.table.planner.StreamPlanner")
Example #2
 def test_create_table_environment_with_old_planner(self):
     t_env = BatchTableEnvironment.create(
         environment_settings=EnvironmentSettings.new_instance().in_batch_mode()
         .use_old_planner().build())
     self.assertEqual(
         t_env._j_tenv.getClass().getName(),
         "org.apache.flink.table.api.bridge.java.internal.BatchTableEnvironmentImpl")
Example #3
def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)

    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 300000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)
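    # (The three settings above: 'python.fn-execution.bundle.size' caps how many elements are
    # buffered per bundle sent to the Python workers, 'python.fn-execution.bundle.time' caps in
    # milliseconds how long a bundle may wait before it is flushed, and 'pipeline.object-reuse'
    # lets Flink skip defensive copies between chained operators.)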

    t_env.register_table_sink(
        "sink",
        PrintTableSink(
            ["id"],
            [DataTypes.INT(False)]))

    @udf(input_types=[DataTypes.INT(False)], result_type=DataTypes.INT(False))
    def inc(x):
        return x + 1

    t_env.register_function("inc", inc)
    t_env.register_java_function("java_inc", "com.alibaba.flink.function.JavaInc")

    num_rows = 100000000
    t_env.from_table_source(RangeTableSource(1, num_rows, 1)).alias("id") \
        .select("inc(id)") \
        .insert_into("sink")

    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink Python UDF inc() consume time: " + str(time.time() - beg_time))
Example #4
 def _local_execute_func(exec_func, write_func, pickle_func, python_path):
     table_env = BatchTableEnvironment.create(
         environment_settings=EnvironmentSettings.new_instance(
         ).use_blink_planner().in_batch_mode().build())
     table_env.get_config().get_configuration().set_string(
         'parallelism.default', '1')
     table_env.get_config().set_python_executable(python_path)
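     # the UDF below ignores its input and always returns pickle_func (a base64-encoded
     # cloudpickle payload), so the job simply writes that payload out through the CSV sink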
     table_env.register_function(
         exec_func,
         udf(lambda _: pickle_func, DataTypes.BIGINT(), DataTypes.STRING()))
     table_env.connect(FileSystem().path(write_func)) \
         .with_format(OldCsv().field('func', DataTypes.STRING())) \
         .with_schema(Schema().field('func', DataTypes.STRING())) \
         .create_temporary_table(exec_func)
     table = table_env.from_elements([(1, 'Joblib')])
     table.select('{}(_1)'.format(exec_func)).insert_into(exec_func)
     table_env.execute(exec_func)
     # decode execution result from table sink file.
     execute_result = cloudpickle.loads(
         codecs.decode(
             pd.DataFrame(pd.read_csv(write_func))[0:].columns[0].encode(),
             'base64'))
     # remove table sink file to clear ineffective files.
     os.remove(write_func)
     return execute_result
Example #5
    def __init__(self):
        # self.feature_extractor = DemoFeatureExtractor()
        self.settings = EnvironmentSettings.new_instance().in_streaming_mode(
        ).use_blink_planner().build()
        self.env = StreamExecutionEnvironment.get_execution_environment()
        self.env.set_parallelism(1)
        self.table_env = StreamTableEnvironment.create(
            self.env, environment_settings=self.settings)
        self.table_env.get_config().get_configuration().set_boolean(
            "python.fn-execution.memory.managed", True)
        self.table_env.get_config().get_configuration().set_string(
            "python.fn-execution.buffer.memory.size", "1024mb")
        self.table_env.get_config().get_configuration().set_string(
            "parallelism.default", "3")
        self.table_env.get_config().get_configuration().set_string(
            "python.fn-execution.bundle.size", "5000")
        self.table_env.get_config().get_configuration().set_string(
            "restart-strategy", "fixed-delay")
        self.table_env.get_config().get_configuration().set_string(
            "restart-strategy.fixed-delay.attempts", "3")
        self.table_env.get_config().get_configuration().set_string(
            "restart-strategy.fixed-delay.delay", "30s")

        source_table = open('source.sql', 'r').read()
        sink_table = open('sink.sql', 'r').read()

        self.table_env.execute_sql(source_table)
        self.table_env.execute_sql(sink_table)
Example #6
    def test_table_environment_with_blink_planner(self):
        t_env = BatchTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance(
            ).in_batch_mode().use_blink_planner().build())

        source_path = os.path.join(self.tempdir, 'streaming.csv')
        sink_path = os.path.join(self.tempdir, 'results')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)

        t_env.register_table_source("source", csv_source)

        t_env.register_table_sink(
            "sink", CsvTableSink(field_names, field_types, sink_path))
        source = t_env.scan("source")

        result = source.alias("a, b, c").select("1 + a, b, c")

        result.insert_into("sink")

        t_env.execute("blink_test")

        results = []
        for root, dirs, files in os.walk(sink_path):
            for sub_file in files:
                with open(os.path.join(root, sub_file), 'r') as f:
                    line = f.readline()
                    while line is not None and line != '':
                        results.append(line)
                        line = f.readline()

        self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
Example #7
    def test_table_environment_with_blink_planner(self):
        self.env.set_parallelism(1)
        t_env = StreamTableEnvironment.create(
            self.env,
            environment_settings=EnvironmentSettings.new_instance(
            ).use_blink_planner().build())

        source_path = os.path.join(self.tempdir, 'streaming.csv')
        sink_path = os.path.join(self.tempdir, 'result.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)

        t_env.register_table_source("source", csv_source)

        t_env.register_table_sink(
            "sink", CsvTableSink(field_names, field_types, sink_path))
        source = t_env.scan("source")

        result = source.alias("a, b, c").select("1 + a, b, c")

        result.insert_into("sink")

        t_env.execute("blink_test")

        results = []
        with open(sink_path, 'r') as f:
            results.append(f.readline())
            results.append(f.readline())

        self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
Example #8
 def init_env(self, **kwargs):
     env = StreamExecutionEnvironment.get_execution_environment()
     self.st_env = StreamTableEnvironment.create(
         env,
         environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
     )
     return
Example #9
def full_outer_join_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements([(1, "1a", "1laa"), (2, "2a", "2aa"),
                                 (3, None, "3aa"), (2, "4b", "4bb"),
                                 (5, "5a", "5aa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"),
                                  (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["d", "e", "f"]).select("d, e, f")

    result = left.full_outer_join(right, "a = d").select("a, b, e")
    # use custom retract sink connector
    sink = TestRetractSink(
        ["a", "b", "c"],
        [DataTypes.BIGINT(),
         DataTypes.STRING(),
         DataTypes.STRING()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("full outer join streaming")
Example #10
def alias_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_alias_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ], result_file))
    orders = st_env.scan("Orders")
    result = orders.alias("x, y, z, t").select("x, y, z, t")
    result.insert_into("result")
    st_env.execute("alias streaming")
Example #11
    def test_to_configuration(self):

        expected_settings = EnvironmentSettings.new_instance().in_batch_mode(
        ).build()
        config = expected_settings.to_configuration()

        self.assertEqual("BATCH",
                         config.get_string("execution.runtime-mode", "stream"))
Example #12
 def setUp(self):
     super(PyFlinkBlinkStreamTableTestCase, self).setUp()
     self.env = StreamExecutionEnvironment.get_execution_environment()
     self.env.set_parallelism(2)
     self.t_env = StreamTableEnvironment.create(
         self.env,
         environment_settings=EnvironmentSettings.new_instance(
         ).in_streaming_mode().use_blink_planner().build())
Example #13
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    env_settings = EnvironmentSettings.new_instance().in_batch_mode(
    ).use_blink_planner().build()
    t_env = TableEnvironment.create(environment_settings=env_settings)

    # used to test pipeline.jars and pipeline.classpaths
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().get_configuration().set_string(config_key, config_value)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    t_env.execute_sql(
        "create temporary system function add_one as 'add_one.add_one' language python"
    )
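    # in the DDL above, 'add_one.add_one' resolves as <module>.<function>, so a Python
    # module named add_one must be importable by the Python workers at runtime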
    t_env.register_java_function("add_one_java",
                                 "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .select("word, add_one(count) as count, add_one_java(count) as count_java") \
        .group_by("word") \
        .select("word, count(count) as count, count(count_java) as count_java") \
        .execute_insert("Results")
Example #14
    def test_planner_selection(self):

        gateway = get_gateway()

        CLASS_NAME = gateway.jvm.EnvironmentSettings.CLASS_NAME

        builder = EnvironmentSettings.new_instance()

        OLD_PLANNER_FACTORY = get_private_field(builder._j_builder,
                                                "OLD_PLANNER_FACTORY")
        OLD_EXECUTOR_FACTORY = get_private_field(builder._j_builder,
                                                 "OLD_EXECUTOR_FACTORY")
        BLINK_PLANNER_FACTORY = get_private_field(builder._j_builder,
                                                  "BLINK_PLANNER_FACTORY")
        BLINK_EXECUTOR_FACTORY = get_private_field(builder._j_builder,
                                                   "BLINK_EXECUTOR_FACTORY")

        # test the default behaviour to make sure it is consistent with the python doc
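        # (this version still defaults to the legacy planner; from Flink 1.11 on the
        # default is the blink planner, which is what Example #17 below asserts)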
        environment_settings = builder.build()

        self.assertEqual(
            environment_settings._j_environment_settings.toPlannerProperties()
            [CLASS_NAME], OLD_PLANNER_FACTORY)

        self.assertEqual(
            environment_settings._j_environment_settings.toExecutorProperties(
            )[CLASS_NAME], OLD_EXECUTOR_FACTORY)

        # test use_old_planner
        environment_settings = builder.use_old_planner().build()

        self.assertEqual(
            environment_settings._j_environment_settings.toPlannerProperties()
            [CLASS_NAME], OLD_PLANNER_FACTORY)

        self.assertEqual(
            environment_settings._j_environment_settings.toExecutorProperties(
            )[CLASS_NAME], OLD_EXECUTOR_FACTORY)

        # test use_blink_planner
        environment_settings = builder.use_blink_planner().build()

        self.assertEqual(
            environment_settings._j_environment_settings.toPlannerProperties()
            [CLASS_NAME], BLINK_PLANNER_FACTORY)

        self.assertEqual(
            environment_settings._j_environment_settings.toExecutorProperties(
            )[CLASS_NAME], BLINK_EXECUTOR_FACTORY)

        # test use_any_planner
        environment_settings = builder.use_any_planner().build()

        self.assertTrue(CLASS_NAME not in environment_settings.
                        _j_environment_settings.toPlannerProperties())

        self.assertTrue(CLASS_NAME not in environment_settings.
                        _j_environment_settings.toExecutorProperties())
Example #15
    def test_blink_from_element(self):
        t_env = BatchTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance(
            ).use_blink_planner().in_batch_mode().build())
        field_names = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r"
        ]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(),
            DataTypes.INTERVAL(DataTypes.DAY(), DataTypes.SECOND()),
            DataTypes.ARRAY(DataTypes.DOUBLE()),
            DataTypes.ARRAY(DataTypes.DOUBLE(False)),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.DATE()),
            DataTypes.DECIMAL(10, 0),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.DOUBLE())
            ]),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
            DataTypes.BYTES(),
            PythonOnlyUDT()
        ]
        schema = DataTypes.ROW(
            list(
                map(
                    lambda field_name, field_type: DataTypes.FIELD(
                        field_name, field_type), field_names, field_types)))
        table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
        t_env.register_table_sink("Results", table_sink)
        t = t_env.from_elements(
            [(1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
              datetime.time(1, 0, 0), datetime.datetime(
                  1970, 1, 2, 0, 0), datetime.datetime(1970, 1, 2, 0, 0),
              datetime.timedelta(days=1, microseconds=10), [1.0, None],
              array.array("d", [1.0, 2.0]), ["abc"],
              [datetime.date(1970, 1, 2)], Decimal(1), Row("a", "b")(1, 2.0), {
                  "key": 1.0
              }, bytearray(b'ABCD'), PythonOnlyPoint(3.0, 4.0))], schema)
        t.insert_into("Results")
        t_env.execute("test")
        actual = source_sink_utils.results()

        expected = [
            '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
            '1970-01-02 00:00:00.0,86400000,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
            '1.000000000000000000,1,2.0,{key=1.0},[65, 66, 67, 68],[3.0, 4.0]'
        ]
        self.assert_equals(actual, expected)
Example #16
    def test_to_Configuration(self):

        expected_settings = \
            EnvironmentSettings.new_instance().use_old_planner().in_batch_mode().build()
        config = expected_settings.to_configuration()

        self.assertEqual("OLD", config.get_string("table.planner", "blink"))
        self.assertEqual("BATCH",
                         config.get_string("execution.runtime-mode", "stream"))
Example #17
    def test_planner_selection(self):

        builder = EnvironmentSettings.new_instance()

        # test the default behaviour to make sure it is consistent with the python doc
        environment_settings = builder.build()

        self.check_blink_planner(environment_settings)

        # test use_blink_planner
        environment_settings = EnvironmentSettings.new_instance(
        ).use_blink_planner().build()

        self.check_blink_planner(environment_settings)

        # test use_any_planner
        environment_settings = builder.use_any_planner().build()

        self.check_any_planner(environment_settings)
Example #18
def group_by_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("group_by_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
        ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
           Json()
           .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")

    orders = st_env.scan("Orders")
    group_by_table = orders.group_by("a").select("a, b.sum as d")
    # Because the schema of the index used in Elasticsearch is
    # {"a":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
    # "b":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
    # we need to cast the type in this demo.
    st_env.register_table("group_table", group_by_table)
    result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table")
    result.insert_into("result")
    st_env.execute("group by agg streaming")
Example #19
 def create_table_env(self):
     exec_env = ExecutionEnvironment.get_execution_environment()
     t_env = BatchTableEnvironment.create(
         environment_settings=EnvironmentSettings.new_instance(
         ).in_batch_mode().use_blink_planner().build())
     t_env._j_tenv.getPlanner().getExecEnv().setParallelism(1)
     statement_set = t_env.create_statement_set()
     t_env.get_config().set_python_executable('/usr/bin/python3')
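     # let the Python workers use Flink's managed memory rather than a separate
     # fixed off-heap allocation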
     t_env.get_config().get_configuration().set_boolean(
         "python.fn-execution.memory.managed", True)
     return exec_env, t_env, statement_set
Example #20
    def check_blink_planner(self, settings: EnvironmentSettings):
        gateway = get_gateway()
        CLASS_NAME = gateway.jvm.EnvironmentSettings.CLASS_NAME

        builder = EnvironmentSettings.new_instance()
        BLINK_PLANNER_FACTORY = get_private_field(builder._j_builder,
                                                  "BLINK_PLANNER_FACTORY")

        self.assertEqual(
            settings._j_environment_settings.toPlannerProperties()[CLASS_NAME],
            BLINK_PLANNER_FACTORY)
Example #21
 def create_table_env(self):
     stream_env = StreamExecutionEnvironment.get_execution_environment()
     stream_env.set_parallelism(1)
     t_env = StreamTableEnvironment.create(
         stream_env,
         environment_settings=EnvironmentSettings.new_instance(
         ).in_streaming_mode().use_blink_planner().build())
     statement_set = t_env.create_statement_set()
     t_env.get_config().set_python_executable('/usr/bin/python3')
     t_env.get_config().get_configuration().set_boolean(
         "python.fn-execution.memory.managed", True)
     return stream_env, t_env, statement_set
Example #22
def hello_world():
    """
    Read data from a random source and output it directly via the print sink.
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode(
    ).use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
                    CREATE TABLE random_source (
                        f_sequence INT,
                        f_random INT,
                        f_random_str STRING
                    ) WITH (
                        'connector' = 'datagen',
                        'rows-per-second'='5',
                        'fields.f_sequence.kind'='sequence',
                        'fields.f_sequence.start'='1',
                        'fields.f_sequence.end'='1000',
                        'fields.f_random.min'='1',
                        'fields.f_random.max'='1000',
                        'fields.f_random_str.length'='10'
                    )
                    """

    sink_ddl = """
                  CREATE TABLE print_sink (
                    f_sequence INT,
                    f_random INT,
                    f_random_str STRING 
                ) WITH (
                  'connector' = 'print'
                )
        """

    # register the source and the sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # register the UDF
    t_env.register_function('pass_by', pass_by)

    # extract the data
    tab = t_env.from_path("random_source")
    # for now we still use the API marked as deprecated, since testing of the new asynchronous submission still needs improvement...
    tab.select("f_sequence, f_random, pass_by(f_random_str) ").insert_into(
        "print_sink")
    # run the job
    t_env.execute("Flink Hello World")
Example #23
def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///app/src/kafka-clients-2.8.0.jar")
    env.add_jars("file:///app/src/flink-connector-kafka_2.12-1.12.3.jar")
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    config = env.get_checkpoint_config()
    config.enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION)
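    # DELETE_ON_CANCELLATION discards externalized checkpoints when the job is cancelled;
    # RETAIN_ON_CANCELLATION would keep them around for manual recovery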

    st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())

    print("register kafka source")
    register_kafka_source(st_env)
    print("register transaction sinks")
    register_transactions_sink_into_csv(st_env)


    st_env.from_path("source_tbl") \
       .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
       .group_by(col("w")) \
       .select("""count(message) as total,
                   w.end as end_time
                  """) \
       .insert_into("total_sink")

    st_env.from_path("source_tbl") \
       .where("message = 'dolorem'") \
       .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
       .group_by(col("w")) \
       .select("""
                   count(message) as total,
                   w.end as end_time
                  """) \
       .insert_into("grep_sink")

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w"), col("message")) \
        .select("""
                    count(message) as total,
                    message,
                    w.end as end_time
                   """) \
        .insert_into("topk_sink")

    st_env.execute("app")
Example #24
def load(token):
    # fetch the dimension data
    pro = ts.pro_api(token)
    df = pro.query(
        'stock_basic',
        list_status='L',
        fields=
        'ts_code,symbol,name,area,industry,market,curr_type,list_date,is_hs')

    # create the entry point of the Flink program
    env_settings = EnvironmentSettings.new_instance().in_batch_mode(
    ).use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # convert the pandas DataFrame into a Table and give it a name by creating a temporary view
    table = table_env.from_pandas(df)
    table_env.create_temporary_view("stock_info", table)
    # declare the output table
    sink_ddl = """
    -- register a MySQL table 'users' in Flink SQL
    create table Results(
        ts_code STRING,
        symbol STRING,
        name  STRING,
        area   STRING,
        industry  STRING,
        market    STRING,
        curr_type STRING,
        list_date  STRING,
        is_hs  STRING
    ) with (
       'connector' = 'jdbc',
       'url' = 'jdbc:mysql://localhost:3306/shares?useUnicode=yes&characterEncoding=UTF-8&useSSL=false',
       'table-name' = 'dim_stock',
       'username' = 'root',
       'password' = '123456'
    )
    """
    table_env.execute_sql(sink_ddl)

    # using the JDBC connector requires adding the extra Java jars
    table_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///home/wy/shares/mysql-connector-java-5.1.49.jar;file:///home/wy/shares/flink-connector-jdbc_2.12-1.12.2.jar"
    )

    # when running in mini-cluster mode, call wait() to block until the job finishes
    table_env.execute_sql(
        "insert into Results select * from stock_info").wait()
Example #25
    def test_add_python_file(self):
        import uuid
        python_file_dir = os.path.join(self.tempdir, "python_file_dir_" + str(uuid.uuid4()))
        os.mkdir(python_file_dir)
        python_file_path = os.path.join(python_file_dir, "test_dep1.py")
        with open(python_file_path, 'w') as f:
            f.write("def add_two(a):\n    return a + 2")

        def plus_two_map(value):
            from test_dep1 import add_two
            return add_two(value)

        get_j_env_configuration(self.env._j_stream_execution_environment).\
            setString("taskmanager.numberOfTaskSlots", "10")
        self.env.add_python_file(python_file_path)
        ds = self.env.from_collection([1, 2, 3, 4, 5])
        ds = ds.map(plus_two_map, Types.LONG()) \
               .slot_sharing_group("data_stream") \
               .map(lambda i: i, Types.LONG()) \
               .slot_sharing_group("table")

        python_file_path = os.path.join(python_file_dir, "test_dep2.py")
        with open(python_file_path, 'w') as f:
            f.write("def add_three(a):\n    return a + 3")

        def plus_three(value):
            from test_dep2 import add_three
            return add_three(value)

        t_env = StreamTableEnvironment.create(
            stream_execution_environment=self.env,
            environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
        self.env.add_python_file(python_file_path)

        from pyflink.table.udf import udf
        from pyflink.table.expressions import col
        add_three = udf(plus_three, result_type=DataTypes.BIGINT())

        tab = t_env.from_data_stream(ds, 'a') \
                   .select(add_three(col('a')))
        t_env.to_append_stream(tab, Types.ROW([Types.LONG()])) \
             .map(lambda i: i[0]) \
             .add_sink(self.test_sink)
        self.env.execute("test add_python_file")
        result = self.test_sink.get_results(True)
        expected = ['6', '7', '8', '9', '10']
        result.sort()
        expected.sort()
        self.assertEqual(expected, result)
Example #26
def kafka_to_mysql():
    """
    Read JSON data from a Kafka source and write it into MySQL. {"msg": "welcome flink users..."}
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode(
    ).use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
                    CREATE TABLE kafka_source (
                        msg STRING
                    ) WITH (
                        'connector' = 'kafka-0.11',
                        'topic' = 'cdn-log',
                        'properties.bootstrap.servers' = 'kafka:9092',
                        'format' = 'json',
                        'scan.startup.mode' = 'latest-offset'
                    )
                    """

    sink_ddl = """
                  CREATE TABLE mysql_sink (
                    msg STRING 
                ) WITH (
                   'connector' = 'jdbc',
                   'url' = 'jdbc:mysql://mysql:3306/flinkdb?characterEncoding=utf-8&useSSL=false',
                   'table-name' = 'cdn_log',
                   'username' = 'root',
                   'password' = '123456',
                   'sink.buffer-flush.max-rows' = '1'
                )
        """

    # register the source and the sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # extract the data
    tab = t_env.from_path("kafka_source")
    # for now we still use the API marked as deprecated, since testing of the new asynchronous submission still needs improvement...
    tab.insert_into("mysql_sink")
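    # a non-deprecated alternative (assuming PyFlink 1.12+) would be roughly:
    # tab.execute_insert("mysql_sink").wait()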
    # run the job
    t_env.execute("kafka_to_mysql")
Example #27
def test_stream():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)

    # t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 1000000)
    # t_env.get_config().get_configuration().set_boolean("table.exec.mini-batch.enabled", True)
    # t_env.get_config().get_configuration().set_integer("table.exec.mini-batch.allow-latency", 1000)
    # t_env.get_config().get_configuration().set_integer("table.exec.mini-batch.size", 100000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)

    t_env.create_temporary_function("python_avg", MeanAggregateFunction())
    t_env.create_java_temporary_system_function("java_avg", "com.alibaba.flink.function.JavaAvg")

    num_rows = 10000000
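    # note: the 'Range' connector used below is a custom test/benchmark source,
    # not one of Flink's built-in connectors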

    t_env.execute_sql(f"""
        CREATE TABLE source (
            id INT,
            num INT,
            rowtime TIMESTAMP(3),
            WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
        ) WITH (
          'connector' = 'Range',
          'start' = '1',
          'end' = '{num_rows}',
          'step' = '1',
          'partition' = '200'
        )
    """)
    t_env.register_table_sink(
        "sink",
        PrintTableSink(
            ["num", "value"],
            [DataTypes.INT(False), DataTypes.FLOAT(False)], 1000000))
    #         .group_by("num") \
    # .select("num % 1000 as num, id") \
    result = t_env.from_path("source") \
        .select("num % 1000 as num, id") \
        .group_by("num") \
        .select("num, python_avg(id)")
    result.insert_into("sink")
    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink stream group agg consume time: " + str(time.time() - beg_time))
Example #28
 def test_to_append_stream(self):
     self.env.set_parallelism(1)
     t_env = StreamTableEnvironment.create(
         self.env,
         environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
     table = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hi")], ["a", "b", "c"])
     new_table = table.select("a + 1, b + 'flink', c")
     ds = t_env.to_append_stream(table=new_table, type_info=Types.ROW([Types.LONG(),
                                                                       Types.STRING(),
                                                                       Types.STRING()]))
     test_sink = DataStreamTestSinkFunction()
     ds.add_sink(test_sink)
     self.env.execute("test_to_append_stream")
     result = test_sink.get_results(False)
     expected = ['+I[2, Hiflink, Hello]', '+I[3, Helloflink, Hi]']
     self.assertEqual(result, expected)
Example #29
 def test_to_retract_stream(self):
     self.env.set_parallelism(1)
     t_env = StreamTableEnvironment.create(
         self.env,
         environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
     table = t_env.from_elements([(1, "Hi", "Hello"), (1, "Hi", "Hello")], ["a", "b", "c"])
     new_table = table.group_by("c").select("a.sum, c as b")
     ds = t_env.to_retract_stream(table=new_table, type_info=Types.ROW([Types.LONG(),
                                                                        Types.STRING()]))
     test_sink = DataStreamTestSinkFunction()
     ds.map(lambda x: x).add_sink(test_sink)
     self.env.execute("test_to_retract_stream")
     result = test_sink.get_results(True)
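     # each record is a (flag, Row) pair: True marks an accumulate (insert) message,
     # False marks a retraction of a previously emitted row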
     expected = ["(True, Row(f0=1, f1='Hello'))", "(False, Row(f0=1, f1='Hello'))",
                 "(True, Row(f0=2, f1='Hello'))"]
     self.assertEqual(result, expected)
Example #30
    def __init__(self):
        # self.feature_extractor = DemoFeatureExtractor()
        self.settings = EnvironmentSettings.new_instance().in_streaming_mode(
        ).use_blink_planner().build()
        self.env = StreamExecutionEnvironment.get_execution_environment()
        self.env.set_parallelism(1)
        self.table_env = StreamTableEnvironment.create(
            self.env, environment_settings=self.settings)
        self.table_env.get_config().get_configuration().set_boolean(
            "python.fn-execution.memory.managed", True)
        self.table_env.add_python_file('feature_extractors')

        source_table = open('feature_extractors/source.sql', 'r').read()
        sink_table = open('feature_extractors/sink.sql', 'r').read()

        self.table_env.execute_sql(source_table)
        self.table_env.execute_sql(sink_table)