Example #1
def inner_join_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/table_inner_join_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = st_env.from_elements([(1, "1a", "1laa"), (2, "2a", "2aa"),
                                 (3, None, "3aa"), (2, "4b", "4bb"),
                                 (5, "5a", "5aa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"),
                                  (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["d", "e", "f"]).select("d, e, f")
    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(),
             DataTypes.STRING(),
             DataTypes.STRING()], result_file))

    result = left.join(right).where("a = d").select("a, b, e")
    result.insert_into("result")
    st_env.execute("inner join streaming")
Example #2
    def add_train_chief_alone_table():
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        table_env = StreamTableEnvironment.create(stream_env)
        statement_set = table_env.create_statement_set()
        work_num = 2
        ps_num = 1
        python_file = os.getcwd() + "/../../src/test/python/add.py"
        func = "map_func"
        prop = {}
        prop[TFCONSTANS.TF_IS_CHIEF_ALONE] = "true"
        prop[MLCONSTANTS.PYTHON_VERSION] = "3.7"
        env_path = None
        input_tb = None
        output_schema = None

        tf_config = TFConfig(work_num, ps_num, prop, python_file, func,
                             env_path)

        train(stream_env, table_env, statement_set, input_tb, tf_config,
              output_schema)

        # inference(stream_env, table_env, statement_set, input_tb, tf_config, output_schema)

        job_client = statement_set.execute().get_job_client()
        if job_client is not None:
            job_client.get_job_execution_result(
                user_class_loader=None).result()
Example #3
 def test_execute(self):
     tmp_dir = tempfile.gettempdir()
     field_names = ['a', 'b', 'c']
     field_types = [
         DataTypes.BIGINT(),
         DataTypes.STRING(),
         DataTypes.STRING()
     ]
     t_env = StreamTableEnvironment.create(self.env)
     t_env.register_table_sink(
         'Results',
         CsvTableSink(
             field_names, field_types,
             os.path.join('{}/{}.csv'.format(tmp_dir, round(time.time())))))
     t_env.insert_into(
         'Results',
         t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c']))
     execution_result = t_env.execute('test_stream_execute')
     self.assertIsNotNone(execution_result.get_job_id())
     self.assertIsNotNone(execution_result.get_net_runtime())
     self.assertEqual(len(execution_result.get_all_accumulator_results()),
                      0)
     self.assertIsNone(
         execution_result.get_accumulator_result('accumulator'))
     self.assertIsNotNone(str(execution_result))
Example #4
    def __init__(self):
        # self.feature_extractor = DemoFeatureExtractor()
        self.settings = EnvironmentSettings.new_instance().in_streaming_mode(
        ).use_blink_planner().build()
        self.env = StreamExecutionEnvironment.get_execution_environment()
        self.env.set_parallelism(1)
        self.table_env = StreamTableEnvironment.create(
            self.env, environment_settings=self.settings)
        self.table_env.get_config().get_configuration().set_boolean(
            "python.fn-execution.memory.managed", True)
        self.table_env.get_config().get_configuration().set_string(
            "python.fn-execution.buffer.memory.size", "1024mb")
        self.table_env.get_config().get_configuration().set_string(
            "parallelism.default", "3")
        self.table_env.get_config().get_configuration().set_string(
            "python.fn-execution.bundle.size", "5000")
        self.table_env.get_config().get_configuration().set_string(
            "restart-strategy", "fixed-delay")
        self.table_env.get_config().get_configuration().set_string(
            "restart-strategy.fixed-delay.attempts", "3")
        self.table_env.get_config().get_configuration().set_string(
            "restart-strategy.fixed-delay.delay", "30s")

        with open('source.sql', 'r') as f:
            source_table = f.read()
        with open('sink.sql', 'r') as f:
            sink_table = f.read()

        self.table_env.execute_sql(source_table)
        self.table_env.execute_sql(sink_table)
Example #5
def select_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)

    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "c"],
                     [DataTypes.STRING(), DataTypes.INT()], result_file))
    orders = st_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    st_env.execute("select streaming")
Example #6
    def __init__(self,
                 parallelism: int = 1,
                 checkpoint_interval: Optional[int] = None,
                 state_ttl: Optional[int] = None) -> None:
        # setting env
        env = StreamExecutionEnvironment.get_execution_environment()
        env.set_parallelism(parallelism)
        if checkpoint_interval:
            env.set_state_backend(
                RocksDBStateBackend(self.checkpoints_path,
                                    enable_incremental_checkpointing=True))
            env.enable_checkpointing(checkpoint_interval * 1000)
        t_config = TableConfig()
        if state_ttl:
            t_config.set_idle_state_retention_time(
                timedelta(seconds=state_ttl),
                timedelta(seconds=state_ttl + 300))
        table_env = StreamTableEnvironment.create(env, table_config=t_config)
        table_env.get_config().get_configuration().set_string(
            "pipeline.jars", self.flink_sql_connector_kafka_jar)

        # set up table
        for ddl in self.tables:
            table_env.execute_sql(ddl)

        self.env = env
        self.table_env = table_env
Example #7
def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)

    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 300000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)

    t_env.register_table_sink(
        "sink",
        PrintTableSink(
            ["id"],
            [DataTypes.INT(False)]))

    @udf(input_types=[DataTypes.INT(False)], result_type=DataTypes.INT(False))
    def inc(x):
        return x + 1

    t_env.register_function("inc", inc)
    t_env.register_java_function("java_inc", "com.alibaba.flink.function.JavaInc")

    num_rows = 100000000
    t_env.from_table_source(RangeTableSource(1, num_rows, 1)).alias("id") \
        .select("inc(id)") \
        .insert_into("sink")

    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink Python UDF inc() consume time: " + str(time.time() - beg_time))
Example #8
    def test_table_environment_with_blink_planner(self):
        self.env.set_parallelism(1)
        t_env = StreamTableEnvironment.create(
            self.env,
            environment_settings=EnvironmentSettings.new_instance(
            ).use_blink_planner().build())

        source_path = os.path.join(self.tempdir + '/streaming.csv')
        sink_path = os.path.join(self.tempdir + '/result.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)

        t_env.register_table_source("source", csv_source)

        t_env.register_table_sink(
            "sink", CsvTableSink(field_names, field_types, sink_path))
        source = t_env.scan("source")

        result = source.alias("a, b, c").select("1 + a, b, c")

        result.insert_into("sink")

        t_env.execute("blink_test")

        results = []
        with open(sink_path, 'r') as f:
            results.append(f.readline())
            results.append(f.readline())

        self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
Example #9
def main_flink():
    # The previous steps fetch the file, preprocess it, and write the result to the 'input' file
    env = StreamExecutionEnvironment.get_execution_environment()
    parr_num = 4
    env.set_parallelism(parr_num)
    t_env = StreamTableEnvironment.create(env)
    @udf(input_types=DataTypes.STRING(), result_type=DataTypes.STRING())
    def cut_extract(string):
        return cut_posseg.cut_extract(string)


    t_env.register_function("cut_extract", cut_extract)
    # Here we create the table and read from 'input'. Question 1: is there a way to use a custom
    # list as the input to save IO overhead, e.g. passing a list like [text A, text B]?
    t_env.connect(FileSystem().path('/home/sjtu/input')) \
        .with_format(OldCsv()
                     .field('text', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('text', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    t_env.connect(FileSystem().path('/home/sjtu/output')) \
        .with_format(OldCsv()
                     .field('result', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('result', DataTypes.STRING())) \
        .create_temporary_table('mySink')

    t_env.from_path('mySource')\
        .select("cut_extract(text)")\
        .insert_into('mySink')
    # Question 2: here the result table is written to a file, but I still need to keep processing
    # this data in the same program. Is there a way to take the mySink table above directly as
    # in-memory data instead of reading it back from disk? (See the sketch after this example.)
    t_env.execute("tutorial_job")
Example #10
 def init_env(self, **kwargs):
     env = StreamExecutionEnvironment.get_execution_environment()
     self.st_env = StreamTableEnvironment.create(
         env,
         environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
     )
     return
Example #11
def train(stream_env=None,
          table_env=None,
          statement_set=None,
          input_table=None,
          tf_config=None,
          output_schema=None):
    """
    Run TF train for flink table api.

    :param stream_env: The StreamExecutionEnvironment. If it's None, this method will create one and execute the job
                       at the end. Otherwise, caller is responsible to trigger the job execution
    :param table_env: The Flink TableEnvironment.
    :param statement_set: The StatementSet created by the given TableEnvironment.
    :param input_table: The input Table.
    :param tf_config: Configurations for the TF program.
    :param output_schema: The TableSchema for the output Table. If it's null, a dummy sink will be connected.
    :return: output Table. Otherwise, caller is responsible to add sink to the output Table before executing the graph.
    """

    if stream_env is None:
        stream_env = StreamExecutionEnvironment.get_execution_environment()
    if table_env is None:
        table_env = StreamTableEnvironment.create(stream_env)
    if statement_set is None:
        statement_set = table_env.create_statement_set()
    if input_table is not None:
        input_table = input_table._j_table
    if output_schema is not None:
        output_schema = output_schema._j_table_schema
    output_table = get_gateway(
    ).jvm.org.flinkextended.flink.ml.tensorflow.client.TFUtils.train(
        stream_env._j_stream_execution_environment,
        table_env._j_tenv, statement_set._j_statement_set, input_table,
        tf_config.java_config(), output_schema)
    return Table(output_table, table_env)
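The docstring above describes the contract; what follows is a minimal, hedged usage sketch that condenses Examples #2 and #14 (the same imports are assumed; the entry-script path, worker/ps counts, and properties are placeholders). Leaving input_table and output_schema as None means no input table is wired in and a dummy sink is connected.

def train_minimal():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    statement_set = table_env.create_statement_set()

    # placeholder worker/ps counts, entry script, and properties
    prop = {MLCONSTANTS.PYTHON_VERSION: "3.7"}
    tf_config = TFConfig(2, 1, prop, "/path/to/your_script.py", "map_func", None)

    # no input table and no output schema: train() connects a dummy sink
    train(stream_env, table_env, statement_set, None, tf_config, None)

    job_client = statement_set.execute().get_job_client()
    if job_client is not None:
        job_client.get_job_execution_result(user_class_loader=None).result()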
Example #12
    def test_custom_env(self):
        import pyflink
        from pyflink.dataset import ExecutionEnvironment
        from pyflink.datastream import StreamExecutionEnvironment
        benv = ExecutionEnvironment.get_execution_environment()
        senv = StreamExecutionEnvironment.get_execution_environment()

        from pyflink.table import BatchTableEnvironment
        from pyflink.table import StreamTableEnvironment

        btenv = BatchTableEnvironment.create(benv)
        stenv = StreamTableEnvironment.create(senv)

        mlenv = useCustomEnv(pyflink.java_gateway.get_gateway(),
                             benv, btenv, senv, stenv)

        t = mlenv.btenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        source = TableSourceBatchOp(t)
        source.print()

        t = mlenv.stenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        source = TableSourceStreamOp(t)
        source.print()
        StreamOperator.execute()

        from pyalink.alink import env
        env._in_custom_env = False
        resetEnv()
Example #13
def full_outer_join_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements([(1, "1a", "1laa"), (2, "2a", "2aa"),
                                 (3, None, "3aa"), (2, "4b", "4bb"),
                                 (5, "5a", "5aa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"),
                                  (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["d", "e", "f"]).select("d, e, f")

    result = left.full_outer_join(right, "a = d").select("a, b, e")
    # use custom retract sink connector
    sink = TestRetractSink(
        ["a", "b", "c"],
        [DataTypes.BIGINT(),
         DataTypes.STRING(),
         DataTypes.STRING()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("full outer join streaming")
Example #14
    def input_output_table():
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        table_env = StreamTableEnvironment.create(stream_env)
        statement_set = table_env.create_statement_set()
        work_num = 2
        ps_num = 1
        python_file = os.getcwd() + "/../../src/test/python/input_output.py"
        prop = {}
        func = "map_func"
        env_path = None
        prop[
            MLCONSTANTS.
            ENCODING_CLASS] = "org.flinkextended.flink.ml.operator.coding.RowCSVCoding"
        prop[
            MLCONSTANTS.
            DECODING_CLASS] = "org.flinkextended.flink.ml.operator.coding.RowCSVCoding"
        inputSb = "INT_32" + "," + "INT_64" + "," + "FLOAT_32" + "," + "FLOAT_64" + "," + "STRING"
        prop["sys:csv_encode_types"] = inputSb
        prop["sys:csv_decode_types"] = inputSb
        prop[MLCONSTANTS.PYTHON_VERSION] = "3.7"
        source_file = os.getcwd() + "/../../src/test/resources/input.csv"
        sink_file = os.getcwd() + "/../../src/test/resources/output.csv"
        table_source = CsvTableSource(source_file, ["a", "b", "c", "d", "e"], [
            DataTypes.INT(),
            DataTypes.BIGINT(),
            DataTypes.FLOAT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING()
        ])
        table_env.register_table_source("source", table_source)
        input_tb = table_env.from_path("source")
        output_schema = TableSchema(["a", "b", "c", "d", "e"], [
            DataTypes.INT(),
            DataTypes.BIGINT(),
            DataTypes.FLOAT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING()
        ])
        sink = CsvTableSink(["a", "b", "c", "d", "e"], [
            DataTypes.INT(),
            DataTypes.BIGINT(),
            DataTypes.FLOAT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING()
        ],
                            sink_file,
                            write_mode=WriteMode.OVERWRITE)
        table_env.register_table_sink("table_row_sink", sink)
        tf_config = TFConfig(work_num, ps_num, prop, python_file, func,
                             env_path)
        output_table = train(stream_env, table_env, statement_set, input_tb,
                             tf_config, output_schema)

        # output_table = inference(stream_env, table_env, statement_set, input_tb, tf_config, output_schema)

        statement_set.add_insert("table_row_sink", output_table)
        job_client = statement_set.execute().get_job_client()
        if job_client is not None:
            job_client.get_job_execution_result(
                user_class_loader=None).result()
Example #15
def filter_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_filter_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ], result_file))
    orders = st_env.scan("Orders")
    result = orders.filter("b % 2 === 0")
    result.insert_into("result")
    st_env.execute("filter streaming")
Example #16
def log_processing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=env_settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
            CREATE TABLE payment_msg(
                createTime VARCHAR,
                orderId BIGINT,
                payAmount DOUBLE,
                payPlatform INT,
                provinceId INT
            ) WITH (
              'connector.type' = 'kafka',
              'connector.version' = 'universal',
              'connector.topic' = 'payment_msg',
              'connector.properties.bootstrap.servers' = 'kafka:9092',
              'connector.properties.group.id' = 'test_3',
              'connector.startup-mode' = 'latest-offset',
              'format.type' = 'json'
            )
            """

    es_sink_ddl = """
            CREATE TABLE es_sink (
            province VARCHAR PRIMARY KEY,
            pay_amount DOUBLE
            ) with (
                'connector.type' = 'elasticsearch',
                'connector.version' = '7',
                'connector.hosts' = 'http://elasticsearch:9200',
                'connector.index' = 'platform_pay_amount_1',
                'connector.document-type' = 'payment',
                'update-mode' = 'upsert',
                'connector.flush-on-checkpoint' = 'true',
                'connector.key-delimiter' = '$',
                'connector.key-null-literal' = 'n/a',
                'connector.bulk-flush.max-size' = '42mb',
                'connector.bulk-flush.max-actions' = '32',
                'connector.bulk-flush.interval' = '1000',
                'connector.bulk-flush.backoff.delay' = '1000',
                'format.type' = 'json'
            )
    """

    t_env.sql_update(source_ddl)
    t_env.sql_update(es_sink_ddl)
    t_env.register_function('province_id_to_name', province_id_to_name)

    t_env.from_path("payment_msg") \
        .select("province_id_to_name(provinceId) as province, payAmount") \
        .group_by("province") \
        .select("province, sum(payAmount) as pay_amount") \
        .insert_into("es_sink")

    t_env.execute("payment_demo")
Example #17
 def setUp(self):
     super(PyFlinkBlinkStreamTableTestCase, self).setUp()
     self.env = StreamExecutionEnvironment.get_execution_environment()
     self.env.set_parallelism(2)
     self.t_env = StreamTableEnvironment.create(
         self.env,
         environment_settings=EnvironmentSettings.new_instance(
         ).in_streaming_mode().use_blink_planner().build())
Example #18
 def create_env(
         self) -> (ExecutionEnvironment, TableEnvironment, StatementSet):
     exec_env = StreamExecutionEnvironment.get_execution_environment()
     exec_env.set_parallelism(1)
     t_config = TableConfig()
     t_env = StreamTableEnvironment.create(exec_env, t_config)
     t_env.get_config().get_configuration().set_string(
         "taskmanager.memory.task.off-heap.size", '80m')
     statement_set = t_env.create_statement_set()
     return exec_env, t_env, statement_set
Example #19
    def test_create_table_environment_with_blink_planner(self):
        t_env = StreamTableEnvironment.create(
            self.env,
            environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())

        planner = t_env._j_tenv.getPlanner()

        self.assertEqual(
            planner.getClass().getName(),
            "org.apache.flink.table.planner.delegation.StreamPlanner")
Example #20
def group_by_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("group_by_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
        ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
           Json()
           .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")

    orders = st_env.scan("Orders")
    group_by_table = orders.group_by("a").select("a, b.sum as d")
    # Because the schema of the target index in Elasticsearch is
    # {"a":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
    #  "b":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}},
    # we need to cast the type in our demo.
    st_env.register_table("group_table", group_by_table)
    result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table")
    result.insert_into("result")
    st_env.execute("group by agg streaming")
Example #21
    def test_construct_with_stream_env(self):
        stream_execution_environment = StreamExecutionEnvironment.get_execution_environment()
        stream_table_environment = StreamTableEnvironment.create(stream_execution_environment)

        ml_environment = MLEnvironment(
            stream_exe_env=stream_execution_environment,
            stream_tab_env=stream_table_environment)
        self.assertEqual(
            ml_environment.get_stream_execution_environment(),
            stream_execution_environment)
        self.assertEqual(ml_environment.get_stream_table_environment(), stream_table_environment)
Example #22
    def get_stream_table_environment(self) -> StreamTableEnvironment:
        """
        Get the StreamTableEnvironment. If the StreamTableEnvironment has not been set,
        it initial the StreamTableEnvironment with default Configuration.

        :return: the StreamTableEnvironment.
        """
        if self._stream_tab_env is None:
            self._stream_tab_env = StreamTableEnvironment.create(
                StreamExecutionEnvironment.get_execution_environment())
        return self._stream_tab_env
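A tiny illustrative sketch of the lazy initialization this getter implements, relying on the MLEnvironment class shown in Example #21 (the no-argument constructor is an assumption here): the first call builds a StreamTableEnvironment with the default configuration, and later calls return the cached instance.

ml_environment = MLEnvironment()
# first access: creates a StreamTableEnvironment on top of a default StreamExecutionEnvironment
stream_table_env = ml_environment.get_stream_table_environment()
# later accesses return the same cached instance
assert stream_table_env is ml_environment.get_stream_table_environment()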
Example #23
    def setUp(self):
        self.env = StreamExecutionEnvironment.get_execution_environment()
        self._load_dependency_jars()
        config = Configuration(
            j_configuration=get_j_env_configuration(self.env._j_stream_execution_environment))
        config.set_boolean("execution.checkpointing.checkpoints-after-tasks-finish.enabled", True)

        self.env.set_parallelism(4)
        self.env.enable_checkpointing(100)
        self.env.set_restart_strategy(RestartStrategies.no_restart())
        self.t_env = StreamTableEnvironment.create(self.env)
        self.temp_dir = tempfile.mkdtemp()
Example #24
    def setUp(self) -> None:
        from pyflink.datastream import StreamExecutionEnvironment

        super(DataStreamConversionTestCases, self).setUp()
        config = Configuration()
        config.set_string("akka.ask.timeout", "20 s")
        self.env = StreamExecutionEnvironment.get_execution_environment(config)
        self.t_env = StreamTableEnvironment.create(self.env)

        self.env.set_parallelism(2)
        self.t_env.get_config().set("python.fn-execution.bundle.size", "1")
        self.test_sink = DataStreamTestSinkFunction()
Example #25
def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts, name, price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
Example #26
 def create_table_env(self):
     stream_env = StreamExecutionEnvironment.get_execution_environment()
     stream_env.set_parallelism(1)
     t_env = StreamTableEnvironment.create(
         stream_env,
         environment_settings=EnvironmentSettings.new_instance(
         ).in_streaming_mode().use_blink_planner().build())
     statement_set = t_env.create_statement_set()
     t_env.get_config().set_python_executable('/usr/bin/python3')
     t_env.get_config().get_configuration().set_boolean(
         "python.fn-execution.memory.managed", True)
     return stream_env, t_env, statement_set
Example #27
def hello_world():
    """
    Read data from a random source and write it out directly with the print sink.
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode(
    ).use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
                    CREATE TABLE random_source (
                        f_sequence INT,
                        f_random INT,
                        f_random_str STRING
                    ) WITH (
                        'connector' = 'datagen',
                        'rows-per-second'='5',
                        'fields.f_sequence.kind'='sequence',
                        'fields.f_sequence.start'='1',
                        'fields.f_sequence.end'='1000',
                        'fields.f_random.min'='1',
                        'fields.f_random.max'='1000',
                        'fields.f_random_str.length'='10'
                    )
                    """

    sink_ddl = """
                  CREATE TABLE print_sink (
                    f_sequence INT,
                    f_random INT,
                    f_random_str STRING 
                ) WITH (
                  'connector' = 'print'
                )
        """

    # register the source and sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # register the UDF
    t_env.register_function('pass_by', pass_by)

    # extract the data
    tab = t_env.from_path("random_source")
    # For now we still use the API marked as deprecated, because the test for the new asynchronous
    # submission needs further work... (a sketch with the newer API follows this example)
    tab.select("f_sequence, f_random, pass_by(f_random_str) ").insert_into(
        "print_sink")
    # execute the job
    t_env.execute("Flink Hello World")
Example #28
def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///app/src/kafka-clients-2.8.0.jar")
    env.add_jars("file:///app/src/flink-connector-kafka_2.12-1.12.3.jar")
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    config = env.get_checkpoint_config()
    config.enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION)

    st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())

    print("register kafka source")
    register_kafka_source(st_env)
    print("register transaction sinks")
    register_transactions_sink_into_csv(st_env)


    st_env.from_path("source_tbl") \
       .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
       .group_by(col("w")) \
       .select("""count(message) as total,
                   w.end as end_time
                  """) \
       .insert_into("total_sink")

    st_env.from_path("source_tbl") \
       .where("message = 'dolorem'") \
       .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
       .group_by(col("w")) \
       .select("""
                   count(message) as total,
                   w.end as end_time
                  """) \
       .insert_into("grep_sink")

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w"), col("message")) \
        .select("""
                    count(message) as total,
                    message,
                    w.end as end_time
                   """) \
        .insert_into("topk_sink")

    st_env.execute("app")
Example #29
    def test_add_python_file(self):
        import uuid
        env = self.env
        python_file_dir = os.path.join(self.tempdir,
                                       "python_file_dir_" + str(uuid.uuid4()))
        os.mkdir(python_file_dir)
        python_file_path = os.path.join(python_file_dir, "test_dep1.py")
        with open(python_file_path, 'w') as f:
            f.write("def add_two(a):\n    return a + 2")

        def plus_two_map(value):
            from test_dep1 import add_two
            return add_two(value)

        get_j_env_configuration(env._j_stream_execution_environment).\
            setString("taskmanager.numberOfTaskSlots", "10")
        env.add_python_file(python_file_path)
        ds = env.from_collection([1, 2, 3, 4, 5])
        ds = ds.map(plus_two_map, Types.LONG()) \
               .slot_sharing_group("data_stream") \
               .map(lambda i: i, Types.LONG()) \
               .slot_sharing_group("table")

        python_file_path = os.path.join(python_file_dir, "test_dep2.py")
        with open(python_file_path, 'w') as f:
            f.write("def add_three(a):\n    return a + 3")

        def plus_three(value):
            from test_dep2 import add_three
            return add_three(value)

        t_env = StreamTableEnvironment.create(
            stream_execution_environment=env,
            environment_settings=EnvironmentSettings.in_streaming_mode())
        env.add_python_file(python_file_path)

        from pyflink.table.udf import udf
        from pyflink.table.expressions import col
        add_three = udf(plus_three, result_type=DataTypes.BIGINT())

        tab = t_env.from_data_stream(ds, col('a')) \
                   .select(add_three(col('a')))
        t_env.to_append_stream(tab, Types.ROW([Types.LONG()])) \
             .map(lambda i: i[0]) \
             .add_sink(self.test_sink)
        env.execute("test add_python_file")
        result = self.test_sink.get_results(True)
        expected = ['6', '7', '8', '9', '10']
        result.sort()
        expected.sort()
        self.assertEqual(expected, result)
Example #30
def tumble_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .build())
                       .build())

    # define the over window operation
    table = table.over_window(
        Over.partition_by(col("name"))
            .order_by(col("ts"))
            .preceding(row_interval(2))
            .following(CURRENT_ROW)
            .alias('w')) \
        .select(table.name, table.price.max.over(col('w')))

    # submit for execution
    table.execute_insert('sink') \
         .wait()