Code example #1
def table_func_python_sql_join_lateral_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)

    source_table = bt_env.from_elements([("a aa aaa", "aa"),
                                         ("b bb bbb", "bb"),
                                         ("c cc ccc", "cc")],
                                        ["a", "b"]).select("a, b")

    result_file = "/tmp/table_func_python_sql_join_lateral_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.STRING(),
                      DataTypes.STRING(),
                      DataTypes.INT()], result_file))

    bt_env.register_java_function("split", "com.pyflink.table.Split")
    bt_env.register_table("MyTable", source_table)
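    # LATERAL TABLE(split(a)) joins each row of MyTable with every (word, length)
    # record produced by the Java table function registered as "split"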

    result = bt_env.sql_query(
        "SELECT a, word, length FROM MyTable, LATERAL TABLE(split(a)) as T(word, length)"
    )

    result.insert_into("result")

    bt_env.execute("table func python sql join lateral api")
Code example #2
def group_by_agg():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_agg.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source("Orders",
                                 CsvTableSource(source_file,
                                                ["a", "b", "c", "rowtime"],
                                                [DataTypes.STRING(),
                                                 DataTypes.INT(),
                                                 DataTypes.INT(),
                                                 DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(),
                                             DataTypes.INT()],
                                            result_file))
    orders = bt_env.scan("Orders")
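    # group rows by column "a" and compute the sum of "b" per group, aliased as "d"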
    result = orders.group_by("a").select("a, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg")

    with open(result_file, 'r') as f:
        print(f.read())
Code example #3
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # used to test pipeline.jars and pipeline.classpaths
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().get_configuration().set_string(config_key, config_value)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.sql_update(sink_ddl)

    t_env.sql_update(
        "create temporary system function add_one as 'add_one.add_one' language python"
    )
    t_env.register_java_function("add_one_java",
                                 "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .select("word, add_one(count) as count, add_one_java(count) as count_java") \
        .group_by("word") \
        .select("word, count(count) as count, count(count_java) as count_java") \
        .insert_into("Results")

    t_env.execute("word_count")
Code example #4
File: 03union.py  Project: xueyifeiyun/enjoyment.code
def union():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = os.getcwd() + "/tmp/table_union_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements([(1, "1b", "1bb"), (2, "2a", "2aa"),
                                 (3, None, "3aa"), (1, "1a", "1laa"),
                                 (1, "1b", "1bb")],
                                ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"),
                                  (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["a", "b", "c"]).select("a, b, c")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(),
             DataTypes.STRING(),
             DataTypes.STRING()], result_file))
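    # union removes duplicate rows from the combined result; union_all keeps them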

    result = left.union(right)
    #result = left.union_all(right)
    result.insert_into("result")
    bt_env.execute("union")

    with open(result_file, 'r') as f:
        print(f.read())
Code example #5
File: minus.py  Project: toby1991/pyflink-demo
def minus_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = "/tmp/table_minus_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements([(1, "ra", "raa"), (2, "lb", "lbb"),
                                 (3, "", "lcc"), (2, "lb", "lbb"),
                                 (1, "ra", "raa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"),
                                  (3, "rc", "rcc"), (1, "ra", "raa")],
                                 ["a", "b", "c"]).select("a, b, c")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(),
             DataTypes.STRING(),
             DataTypes.STRING()], result_file))
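    # minus returns the rows of left that do not appear in right, with duplicates removed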

    result = left.minus(right)
    result.insert_into("result")
    bt_env.execute("minus batch")
Code example #6
def group_by_window_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_window_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "start", "end", "rowtime", "d"], [
            DataTypes.STRING(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.INT()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("a, w") \
        .select("a, w.start, w.end, w.rowtime, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg batch")
Code example #7
def scalar_func_python_table_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)

    source_table = bt_env.from_elements([("a", "aa"), ("b", "bb"),
                                         ("c", "cc")],
                                        ["a", "b"]).select("a, b")

    result_file = "/tmp/scalar_func_python_table_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.STRING(),
                      DataTypes.INT(),
                      DataTypes.INT()], result_file))

    # register the java scalar function
    bt_env.register_java_function("hashCode", "com.pyflink.table.HashCode")

    # use the java scalar function in Python Table API
    result = source_table.select("a, a.hashCode(), hashCode(a)")
    result.insert_into("result")
    bt_env.execute("scalar func python table api")
Code example #8
def inner_join():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = "/tmp/table_inner_join.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements(
        [(1, "1a", "1laa"),
         (2, "2a", "2aa"),
         (3, None, "3aa"),
         (2, "4b", "4bb"),
         (5, "5a", "5aa")],
        ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements([
        (1, "1b", "1bb"),
        (2, None, "2bb"),
        (1, "3b", "3bb"),
        (4, "4b", "4bb")],
        ["d", "e", "f"]).select("d, e, f")
    bt_env.register_table_sink("result",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file))
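    # inner join on a = d; only rows with matching keys from both tables are emitted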

    result = left.join(right).where("a = d").select("a, b, e")
    result.insert_into("result")
    bt_env.execute("inner join")

    with open(result_file, 'r') as f:
        print(f.read())
Code example #9
File: test_env.py  Project: wwjiang007/Alink
    def test_custom_env(self):
        import pyflink
        from pyflink.dataset import ExecutionEnvironment
        from pyflink.datastream import StreamExecutionEnvironment
        benv = ExecutionEnvironment.get_execution_environment()
        senv = StreamExecutionEnvironment.get_execution_environment()

        from pyflink.table import BatchTableEnvironment
        from pyflink.table import StreamTableEnvironment

        btenv = BatchTableEnvironment.create(benv)
        stenv = StreamTableEnvironment.create(senv)

        mlenv = useCustomEnv(pyflink.java_gateway.get_gateway(),
                             benv, btenv, senv, stenv)

        t = mlenv.btenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        source = TableSourceBatchOp(t)
        source.print()

        t = mlenv.stenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        source = TableSourceStreamOp(t)
        source.print()
        StreamOperator.execute()

        from pyalink.alink import env
        env._in_custom_env = False
        resetEnv()
Code example #10
    def create_table_env(self):
        exec_env = ExecutionEnvironment.get_execution_environment()
        exec_env.set_parallelism(1)
        t_config = TableConfig()
        t_env = BatchTableEnvironment.create(exec_env, t_config)
        statement_set = t_env.create_statement_set()
        return exec_env, t_env, statement_set
Code example #11
def demo02():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # StreamExecutionEnvironment

    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/input'
        )
    """

    my_sink_ddl = """
        create table mySink (
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output'
        )
    """

    t_env.execute_sql(my_source_ddl)
    t_env.execute_sql(my_sink_ddl)

    tab = t_env.from_path('mySource')
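    # count occurrences of each word and write the result into mySink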
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
Code example #12
def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # StreamExecutionEnvironment

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    # an error is raised if the output file already exists
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
Code example #13
def scalar_func_python_sql():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)

    source_table = bt_env.from_elements([("a", 1), ("b", 2), ("c", 3)],
                                        ["a", "b"]).select("a, b")

    result_file = "/tmp/scalar_func_python_sql.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b"],
                     [DataTypes.STRING(), DataTypes.INT()], result_file))

    # register the java scalar function
    bt_env.register_java_function("hashCode", "com.pyflink.table.HashCode")

    # register the table for using in the sql query
    bt_env.register_table("MyTable", source_table)

    result = bt_env.sql_query("SELECT a, hashCode(a) FROM MyTable")
    result.insert_into("result")
    bt_env.execute("scalar func python sql")
Code example #14
def select_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)

    bt_env.register_table_source("Orders",
                                 CsvTableSource(source_file,
                                                ["a", "b", "c", "rowtime"],
                                                [DataTypes.STRING(),
                                                 DataTypes.INT(),
                                                 DataTypes.INT(),
                                                 DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink("result",
                               CsvTableSink(["a", "c"],
                                            [DataTypes.STRING(),
                                             DataTypes.INT()],
                                            result_file))
    orders = bt_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    bt_env.execute("select batch")
Code example #15
File: test_ml_environment.py  Project: XSanC/FLink
    def test_construct_with_batch_env(self):
        execution_environment = ExecutionEnvironment.get_execution_environment()
        batch_table_environment = BatchTableEnvironment.create(execution_environment)

        ml_environment = MLEnvironment(
            exe_env=execution_environment,
            batch_tab_env=batch_table_environment)
        self.assertEqual(ml_environment.get_execution_environment(), execution_environment)
        self.assertEqual(ml_environment.get_batch_table_environment(), batch_table_environment)
Code example #16
File: ml_environment.py  Project: Temitope-A/cog
    def get_execution_environment(self) -> ExecutionEnvironment:
        """
        Get the ExecutionEnvironment. If the ExecutionEnvironment has not been set,
        it is initialized with the default configuration.

        :return: the batch ExecutionEnvironment.
        """
        if self._exe_env is None:
            self._exe_env = ExecutionEnvironment.get_execution_environment()
        return self._exe_env
Code example #17
    def create_env(
            self) -> (ExecutionEnvironment, TableEnvironment, StatementSet):
        exec_env = ExecutionEnvironment.get_execution_environment()
        exec_env.set_parallelism(1)
        t_config = TableConfig()
        t_env = BatchTableEnvironment.create(exec_env, t_config)
        t_env.get_config().get_configuration().set_string(
            "taskmanager.memory.task.off-heap.size", '80m')
        statement_set = t_env.create_statement_set()
        return exec_env, t_env, statement_set
Code example #18
File: ml_environment.py  Project: Temitope-A/cog
    def get_batch_table_environment(self) -> BatchTableEnvironment:
        """
        Get the BatchTableEnvironment. If the BatchTableEnvironment has not been set,
        it is initialized with the default configuration.

        :return: the BatchTableEnvironment.
        """
        if self._batch_tab_env is None:
            self._batch_tab_env = BatchTableEnvironment.create(
                ExecutionEnvironment.get_execution_environment())
        return self._batch_tab_env
Code example #19
    def create_table_env(self):
        exec_env = ExecutionEnvironment.get_execution_environment()
        t_env = BatchTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance(
            ).in_batch_mode().use_blink_planner().build())
        t_env._j_tenv.getPlanner().getExecEnv().setParallelism(1)
        statement_set = t_env.create_statement_set()
        t_env.get_config().set_python_executable('/usr/bin/python3')
        t_env.get_config().get_configuration().set_boolean(
            "python.fn-execution.memory.managed", True)
        return exec_env, t_env, statement_set
Code example #20
    def test_equals_and_hash(self):
        config1 = ExecutionEnvironment.get_execution_environment().get_config()
        config2 = ExecutionEnvironment.get_execution_environment().get_config()

        self.assertEqual(config1, config2)
        self.assertEqual(hash(config1), hash(config2))

        config1.set_parallelism(12)
        self.assertNotEqual(config1, config2)
        self.assertNotEqual(hash(config1), hash(config2))

        config2.set_parallelism(12)
        self.assertEqual(config1, config2)
        self.assertEqual(hash(config1), hash(config2))
Code example #21
File: word_count.py  Project: Bhola-B2C/PyFlink
def word_count():
    result = wikipedia.page("New York City")
    content = result.summary
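    # content now holds the summary text of the "New York City" Wikipedia page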


    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)
    print(add.add(10, 5))
    print("Word Count")
    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    #sink_ddl = """
    #    create table Results(
    #        word VARCHAR,
    #        `count` BIGINT
    #    ) with (
    #        'connector.type' = 'filesystem',
    #        'format.type' = 'csv',
    #        'connector.path' = '{}'
    #   )
    #    """.format(result_path)
    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('Results')
    # t_env.sql_update(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Code example #22
File: env.py  Project: wwjiang007/Alink
def usePyFlinkEnv(parallelism: int = None, flinkHome: str = None) -> MLEnv:
    global _mlenv
    if in_custom_env():
        print(
            "Warning: usePyFlinkEnv will do nothing, since useCustomEnv is used to initialize MLEnv."
        )
        return _mlenv

    resetEnv()
    if flinkHome is not None:
        g_config["flink_home"] = flinkHome

    # Let PyFlink launch the gateway, and warn users to add jars to the PyFlink lib path
    print(
        "Warning: You're running the script with 'getMLEnv'. "
        "You have to manually add Alink jars to PyFlink lib path to make the script work."
    )
    import pyflink
    # noinspection PyUnresolvedReferences
    gateway = pyflink.java_gateway.get_gateway()
    # noinspection PyUnresolvedReferences
    pyflink.java_gateway.import_flink_view(gateway)

    # In PyFlink 1.9 and 1.10, PyFlink doesn't start the callback server,
    # so we start it manually.
    success = gateway.start_callback_server(
        callback_server_parameters=CallbackServerParameters(
            port=0, daemonize=True, daemonize_connections=True))
    if success:
        callback_server_port = gateway.get_callback_server(
        ).get_listening_port()
        gateway.java_gateway_server.resetCallbackClient(
            gateway.java_gateway_server.getCallbackClient().getAddress(),
            callback_server_port)

    set_java_gateway(gateway)

    from pyflink.dataset import ExecutionEnvironment
    from pyflink.datastream import StreamExecutionEnvironment

    benv = ExecutionEnvironment.get_execution_environment()
    senv = StreamExecutionEnvironment.get_execution_environment()
    if parallelism is not None:
        benv.set_parallelism(parallelism)
        senv.set_parallelism(parallelism)

    # noinspection PyProtectedMember
    _mlenv = setup_py_ml_env(gateway, benv._j_execution_environment,
                             senv._j_stream_execution_environment)
    return _mlenv
Code example #23
    def test_create_table_environment(self):
        table_config = TableConfig()
        table_config.set_max_generated_code_length(32000)
        table_config.set_null_check(False)
        table_config.set_local_timezone("Asia/Shanghai")

        env = ExecutionEnvironment.get_execution_environment()
        t_env = BatchTableEnvironment.create(env, table_config)

        read_table_config = t_env.get_config()

        self.assertFalse(read_table_config.get_null_check())
        self.assertEqual(read_table_config.get_max_generated_code_length(), 32000)
        self.assertEqual(read_table_config.get_local_timezone(), "Asia/Shanghai")
Code example #24
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    t_env.register_java_function("len", "org.apache.flink.udf.UDFLength")
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, len(word), count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Code example #25
File: word_count.py  Project: wupengbo125/penter
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    # t_config.set_python_executable("/opt/python38/bin/python3")
    # alternatively, add "python.client.executable: /usr/bin/python3" to conf/flink-conf.yaml
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])


    table.group_by(table.word) \
        .select(table.word, expr.lit(1).count.alias('count')) \
        .execute_insert("Results").wait()
Code example #26
def offset_and_fetch_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file_1 = "/tmp/table_offset_and_fetch_batch_1.csv"
    result_file_2 = "/tmp/table_offset_and_fetch_batch_2.csv"
    result_file_3 = "/tmp/table_offset_and_fetch_batch_3.csv"
    if os.path.exists(result_file_1):
        os.remove(result_file_1)
    if os.path.exists(result_file_2):
        os.remove(result_file_2)
    if os.path.exists(result_file_3):
        os.remove(result_file_3)

    bt_env.register_table_sink("result1",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file_1))

    bt_env.register_table_sink("result2",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file_2))

    bt_env.register_table_sink("result3",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file_3))

    left = bt_env.from_elements(
        [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")

    ordered_table = left.order_by("a.asc")
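    # fetch(n) limits the sorted result to its first n rows, offset(n) skips the
    # first n rows, and offset(1).fetch(2) returns rows 2 and 3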

    ordered_table.fetch(5).insert_into("result1")
    ordered_table.offset(1).insert_into("result2")
    ordered_table.offset(1).fetch(2).insert_into("result3")

    bt_env.execute("offset and fetch batch")
Code example #27
def word_count():
    # read and concatenate the five input files under /home/mnm/flink-1.9.1
    content = ""
    for i in range(1, 6):
        with open("/home/mnm/flink-1.9.1/%d" % i, "r") as f:
            content += f.read()

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("Python batch word count")
Code example #28
def word_count():
    # declare a table environment, set configurations.
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    output_file = os.path.abspath('.') + '/out.txt'
    if os.path.exists(output_file):
        try:
            if os.path.isfile(output_file):
                os.remove(output_file)
        except OSError as e:
            print("Error", e.filename, e.strerror)
    print("Results:", output_file)

    sink_ddl = """
            create table Results(
                word VARCHAR,
                `count` BIGINT
            ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
            )
        """.format(output_file)
    t_env.sql_update(sink_ddl)

    # create the source table from a single string,
    # perform some transformations, and write the results to the Results table
    content = "Who's there? I think I hear them. Stand, ho! Who's there?"
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    # execute the Flink Python Table API job
    t_env.execute("word_count")
Code example #29
def aggregate_func_python_table_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_table = bt_env.from_elements([("a", 1, 1), ("a", 2, 2), ("b", 3, 2),
                                         ("a", 5, 2)],
                                        ["user", "points", "level"])

    result_file = "/tmp/aggregate_func_python_table_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.BIGINT()], result_file))
    bt_env.register_java_function("wAvg", "com.pyflink.table.WeightedAvg")
    result = source_table.group_by("user").select(
        "user, wAvg(points, level) as avgPoints")
    result.insert_into("result")
    bt_env.execute("aggregate func python table api")
Code example #30
class main():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # t_env.connect(FileSystem().path('./temp/deviceorientation')) \
    #     .with_format(OldCsv()
    #                 .field('word', DataTypes.STRING())) \
    #     .with_schema(Schema()
    #                 .field('word', DataTypes.STRING())) \
    #     .create_temporary_table('mySource')
    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = './temp/input'
        )
    """
    t_env.sql_update(my_source_ddl)

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                    .field_delimiter('\t')
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                    .field('word', DataTypes.STRING())
                    .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')
    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')

    t_env.execute("tutorial_job")