Example 1
def select_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)

    bt_env.register_table_source("Orders",
                                 CsvTableSource(source_file,
                                                ["a", "b", "c", "rowtime"],
                                                [DataTypes.STRING(),
                                                 DataTypes.INT(),
                                                 DataTypes.INT(),
                                                 DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink("result",
                               CsvTableSink(["a", "c"],
                                            [DataTypes.STRING(),
                                             DataTypes.INT()],
                                            result_file))
    orders = bt_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    bt_env.execute("select batch")
Example 2
 def test_execute(self):
     tmp_dir = tempfile.gettempdir()
     field_names = ['a', 'b', 'c']
     field_types = [
         DataTypes.BIGINT(),
         DataTypes.STRING(),
         DataTypes.STRING()
     ]
     t_env = BatchTableEnvironment.create(self.env)
     t_env.register_table_sink(
         'Results',
         CsvTableSink(
             field_names, field_types,
             os.path.join('{}/{}.csv'.format(tmp_dir, round(time.time())))))
     t_env.insert_into(
         'Results',
         t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c']))
     execution_result = t_env.execute('test_batch_execute')
     self.assertIsNotNone(execution_result.get_job_id())
     self.assertIsNotNone(execution_result.get_net_runtime())
     self.assertEqual(len(execution_result.get_all_accumulator_results()),
                      0)
     self.assertIsNone(
         execution_result.get_accumulator_result('accumulator'))
     self.assertIsNotNone(str(execution_result))
Example 3
def minus_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = "/tmp/table_minus_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements([(1, "ra", "raa"), (2, "lb", "lbb"),
                                 (3, "", "lcc"), (2, "lb", "lbb"),
                                 (1, "ra", "raa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"),
                                  (3, "rc", "rcc"), (1, "ra", "raa")],
                                 ["a", "b", "c"]).select("a, b, c")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(),
             DataTypes.STRING(),
             DataTypes.STRING()], result_file))

    result = left.minus(right)
    result.insert_into("result")
    bt_env.execute("minus batch")
Example 4
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # used to test pipeline.jars and pipeline.classpaths
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().get_configuration().set_string(config_key, config_value)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.sql_update(sink_ddl)

    t_env.sql_update(
        "create temporary system function add_one as 'add_one.add_one' language python"
    )
    t_env.register_java_function("add_one_java",
                                 "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .select("word, add_one(count) as count, add_one_java(count) as count_java") \
        .group_by("word") \
        .select("word, count(count) as count, count(count_java) as count_java") \
        .insert_into("Results")

    t_env.execute("word_count")
Example 5
def inner_join():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = "/tmp/table_inner_join.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements(
        [(1, "1a", "1laa"),
         (2, "2a", "2aa"),
         (3, None, "3aa"),
         (2, "4b", "4bb"),
         (5, "5a", "5aa")],
        ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements([
        (1, "1b", "1bb"),
        (2, None, "2bb"),
        (1, "3b", "3bb"),
        (4, "4b", "4bb")],
        ["d", "e", "f"]).select("d, e, f")
    bt_env.register_table_sink("result",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file))

    result = left.join(right).where("a = d").select("a, b, e")
    result.insert_into("result")
    bt_env.execute("inner join")

    with open(result_file, 'r') as f:
        print(f.read())
Example 6
def group_by_agg():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_agg.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source("Orders",
                                 CsvTableSource(source_file,
                                                ["a", "b", "c", "rowtime"],
                                                [DataTypes.STRING(),
                                                 DataTypes.INT(),
                                                 DataTypes.INT(),
                                                 DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(),
                                             DataTypes.INT()],
                                            result_file))
    orders = bt_env.scan("Orders")
    result = orders.group_by("a").select("a, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg")

    with open(result_file, 'r') as f:
        print(f.read())
Example 7
def scalar_func_python_sql():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)

    source_table = bt_env.from_elements([("a", 1), ("b", 2), ("c", 3)],
                                        ["a", "b"]).select("a, b")

    result_file = "/tmp/scalar_func_python_sql.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b"],
                     [DataTypes.STRING(), DataTypes.INT()], result_file))

    # register the java scalar function
    bt_env.register_java_function("hashCode", "com.pyflink.table.HashCode")

    # register the table for use in the SQL query
    bt_env.register_table("MyTable", source_table)

    result = bt_env.sql_query("SELECT a, hashCode(a) FROM MyTable")
    result.insert_into("result")
    bt_env.execute("scalar func python sql")
Example 8
def group_by_window_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_window_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "start", "end", "rowtime", "d"], [
            DataTypes.STRING(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.INT()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("a, w") \
        .select("a, w.start, w.end, w.rowtime, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg batch")
Example 9
 def _local_execute_func(exec_func, write_func, pickle_func, python_path):
     table_env = BatchTableEnvironment.create(
         environment_settings=EnvironmentSettings.new_instance(
         ).use_blink_planner().in_batch_mode().build())
     table_env.get_config().get_configuration().set_string(
         'parallelism.default', '1')
     table_env.get_config().set_python_executable(python_path)
     table_env.register_function(
         exec_func,
         udf(lambda _: pickle_func, DataTypes.BIGINT(), DataTypes.STRING()))
     table_env.connect(FileSystem().path(write_func)) \
         .with_format(OldCsv().field('func', DataTypes.STRING())) \
         .with_schema(Schema().field('func', DataTypes.STRING())) \
         .create_temporary_table(exec_func)
     table = table_env.from_elements([(1, 'Joblib')])
     table.select('{}(_1)'.format(exec_func)).insert_into(exec_func)
     table_env.execute(exec_func)
     # decode execution result from table sink file.
     execute_result = cloudpickle.loads(
         codecs.decode(
             pd.DataFrame(pd.read_csv(write_func))[0:].columns[0].encode(),
             'base64'))
     # remove table sink file to clear ineffective files.
     os.remove(write_func)
     return execute_result
Example 10
 def create_table_env(self):
     exec_env = ExecutionEnvironment.get_execution_environment()
     exec_env.set_parallelism(1)
     t_config = TableConfig()
     t_env = BatchTableEnvironment.create(exec_env, t_config)
     statement_set = t_env.create_statement_set()
     return exec_env, t_env, statement_set
Example 11
def union():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = os.getcwd() + "/tmp/table_union_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements([(1, "1b", "1bb"), (2, "2a", "2aa"),
                                 (3, None, "3aa"), (1, "1a", "1laa"),
                                 (1, "1b", "1bb")],
                                ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"),
                                  (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["a", "b", "c"]).select("a, b, c")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(),
             DataTypes.STRING(),
             DataTypes.STRING()], result_file))

    result = left.union(right)
    #result = left.union_all(right)
    result.insert_into("result")
    bt_env.execute("union")

    with open(result_file, 'r') as f:
        print(f.read())
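For reference, union() returns the distinct rows of the two inputs, while the commented-out union_all() keeps duplicates. A minimal variant of the job above (same tables and sink assumed):

# keep duplicate rows instead of deduplicating them
result = left.union_all(right)
result.insert_into("result")
bt_env.execute("union all")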
Example 12
    def test_custom_env(self):
        import pyflink
        from pyflink.dataset import ExecutionEnvironment
        from pyflink.datastream import StreamExecutionEnvironment
        benv = ExecutionEnvironment.get_execution_environment()
        senv = StreamExecutionEnvironment.get_execution_environment()

        from pyflink.table import BatchTableEnvironment
        from pyflink.table import StreamTableEnvironment

        btenv = BatchTableEnvironment.create(benv)
        stenv = StreamTableEnvironment.create(senv)

        mlenv = useCustomEnv(pyflink.java_gateway.get_gateway(),
                             benv, btenv, senv, stenv)

        t = mlenv.btenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        source = TableSourceBatchOp(t)
        source.print()

        t = mlenv.stenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        source = TableSourceStreamOp(t)
        source.print()
        StreamOperator.execute()

        from pyalink.alink import env
        env._in_custom_env = False
        resetEnv()
Example 13
def demo02():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # StreamExecutionEnvironment

    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/input'
        )
    """

    my_sink_ddl = """
        create table mySink (
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output'
        )
    """

    t_env.execute_sql(my_source_ddl)
    t_env.execute_sql(my_sink_ddl)

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
Example 14
def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # StreamExecutionEnvironment

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    # an error is raised if the output path already exists
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
Example 15
def table_func_python_sql_join_lateral_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)

    source_table = bt_env.from_elements([("a aa aaa", "aa"),
                                         ("b bb bbb", "bb"),
                                         ("c cc ccc", "cc")],
                                        ["a", "b"]).select("a, b")

    result_file = "/tmp/table_func_python_sql_join_lateral_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.STRING(),
                      DataTypes.STRING(),
                      DataTypes.INT()], result_file))

    bt_env.register_java_function("split", "com.pyflink.table.Split")
    bt_env.register_table("MyTable", source_table)

    result = bt_env.sql_query(
        "SELECT a, word, length FROM MyTable, LATERAL TABLE(split(a)) as T(word, length)"
    )

    result.insert_into("result")

    bt_env.execute("table func python sql join lateral api")
Example 16
def scalar_func_python_table_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)

    source_table = bt_env.from_elements([("a", "aa"), ("b", "bb"),
                                         ("c", "cc")],
                                        ["a", "b"]).select("a, b")

    result_file = "/tmp/scalar_func_python_table_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.STRING(),
                      DataTypes.INT(),
                      DataTypes.INT()], result_file))

    # register the java scalar function
    bt_env.register_java_function("hashCode", "com.pyflink.table.HashCode")

    # use the java scalar function in Python Table API
    result = source_table.select("a, a.hashCode(), hashCode(a)")
    result.insert_into("result")
    bt_env.execute("scalar func python table api")
Example 17
    def test_blink_from_element(self):
        t_env = BatchTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance(
            ).use_blink_planner().in_batch_mode().build())
        field_names = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r"
        ]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(),
            DataTypes.INTERVAL(DataTypes.DAY(), DataTypes.SECOND()),
            DataTypes.ARRAY(DataTypes.DOUBLE()),
            DataTypes.ARRAY(DataTypes.DOUBLE(False)),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.DATE()),
            DataTypes.DECIMAL(10, 0),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.DOUBLE())
            ]),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
            DataTypes.BYTES(),
            PythonOnlyUDT()
        ]
        schema = DataTypes.ROW(
            list(
                map(
                    lambda field_name, field_type: DataTypes.FIELD(
                        field_name, field_type), field_names, field_types)))
        table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
        t_env.register_table_sink("Results", table_sink)
        t = t_env.from_elements(
            [(1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
              datetime.time(1, 0, 0), datetime.datetime(
                  1970, 1, 2, 0, 0), datetime.datetime(1970, 1, 2, 0, 0),
              datetime.timedelta(days=1, microseconds=10), [1.0, None],
              array.array("d", [1.0, 2.0]), ["abc"],
              [datetime.date(1970, 1, 2)], Decimal(1), Row("a", "b")(1, 2.0), {
                  "key": 1.0
              }, bytearray(b'ABCD'), PythonOnlyPoint(3.0, 4.0))], schema)
        t.insert_into("Results")
        t_env.execute("test")
        actual = source_sink_utils.results()

        expected = [
            '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
            '1970-01-02 00:00:00.0,86400000,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
            '1.000000000000000000,1,2.0,{key=1.0},[65, 66, 67, 68],[3.0, 4.0]'
        ]
        self.assert_equals(actual, expected)
Example 18
    def test_construct_with_batch_env(self):
        execution_environment = ExecutionEnvironment.get_execution_environment()
        batch_table_environment = BatchTableEnvironment.create(execution_environment)

        ml_environment = MLEnvironment(
            exe_env=execution_environment,
            batch_tab_env=batch_table_environment)
        self.assertEqual(ml_environment.get_execution_environment(), execution_environment)
        self.assertEqual(ml_environment.get_batch_table_environment(), batch_table_environment)
Example 19
 def create_env(
         self) -> (ExecutionEnvironment, TableEnvironment, StatementSet):
     exec_env = ExecutionEnvironment.get_execution_environment()
     exec_env.set_parallelism(1)
     t_config = TableConfig()
     t_env = BatchTableEnvironment.create(exec_env, t_config)
     t_env.get_config().get_configuration().set_string(
         "taskmanager.memory.task.off-heap.size", '80m')
     statement_set = t_env.create_statement_set()
     return exec_env, t_env, statement_set
Example 20
    def get_batch_table_environment(self) -> BatchTableEnvironment:
        """
        Get the BatchTableEnvironment. If the BatchTableEnvironment has not been set,
        it initializes the BatchTableEnvironment with the default configuration.

        :return: the BatchTableEnvironment.
        """
        if self._batch_tab_env is None:
            self._batch_tab_env = BatchTableEnvironment.create(
                ExecutionEnvironment.get_execution_environment())
        return self._batch_tab_env
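A brief usage sketch of this lazy getter (an assumption: MLEnvironment is imported as in Example 18 and its constructor accepts all-default arguments):

# the first call creates a BatchTableEnvironment with a default ExecutionEnvironment,
# subsequent calls return the same instance
ml_env = MLEnvironment()
bt_env = ml_env.get_batch_table_environment()
assert bt_env is ml_env.get_batch_table_environment()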
Example 21
 def create_table_env(self):
     exec_env = ExecutionEnvironment.get_execution_environment()
     t_env = BatchTableEnvironment.create(
         environment_settings=EnvironmentSettings.new_instance(
         ).in_batch_mode().use_blink_planner().build())
     t_env._j_tenv.getPlanner().getExecEnv().setParallelism(1)
     statement_set = t_env.create_statement_set()
     t_env.get_config().set_python_executable('/usr/bin/python3')
     t_env.get_config().get_configuration().set_boolean(
         "python.fn-execution.memory.managed", True)
     return exec_env, t_env, statement_set
Example 22
def word_count():
    result = wikipedia.page("New York City")
    content = result.summary


    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)
    print(add.add(10, 5))
    print("Word Count")
    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    #sink_ddl = """
    #    create table Results(
    #        word VARCHAR,
    #        `count` BIGINT
    #    ) with (
    #        'connector.type' = 'filesystem',
    #        'format.type' = 'csv',
    #        'connector.path' = '{}'
    #   )
    #    """.format(result_path)
    t_env.connect(FileSystem().path('/tmp/output')) \
    .with_format(OldCsv()
                 .field_delimiter('\t')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .create_temporary_table('Results')
    #t_env.sql_update(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Example 23
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    #t_config.set_python_executable("/opt/python38/bin/python3")
    # alternatively, add python.client.executable: /usr/bin/python3 to conf/flink-conf.yaml
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])


    table.group_by(table.word) \
        .select(table.word, expr.lit(1).count.alias('count')) \
        .execute_insert("Results").wait()
Example 24
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    t_env.register_java_function("len", "org.apache.flink.udf.UDFLength")
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, len(word), count(1) as count") \
         .insert_into("Results")

    t_env.execute("word_count")
Example 25
def load(token):
    # fetch the stock basic-info dimension data via the Tushare pro API
    pro = ts.pro_api(token)
    df = pro.query(
        'stock_basic',
        list_status='L',
        fields=
        'ts_code,symbol,name,area,industry,market,curr_type,list_date,is_hs')

    # create the entry point of the Flink program
    env_settings = EnvironmentSettings.new_instance().in_batch_mode(
    ).use_blink_planner().build()
    table_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # convert the pandas DataFrame into a Table and expose it under an alias by creating a view
    table = table_env.from_pandas(df)
    table_env.create_temporary_view("stock_info", table)
    # declare the output sink
    sink_ddl = """
    -- register a MySQL table 'users' in Flink SQL
    create table Results(
        ts_code STRING,
        symbol STRING,
        name  STRING,
        area   STRING,
        industry  STRING,
        market    STRING,
        curr_type STRING,
        list_date  STRING,
        is_hs  STRING
    ) with (
       'connector' = 'jdbc',
       'url' = 'jdbc:mysql://localhost:3306/shares?useUnicode=yes&characterEncoding=UTF-8&useSSL=false',
       'table-name' = 'dim_stock',
       'username' = 'root',
       'password' = '123456'
    )
    """
    table_env.execute_sql(sink_ddl)

    # using the JDBC connector requires additional Java jars
    table_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///home/wy/shares/mysql-connector-java-5.1.49.jar;file:///home/wy/shares/flink-connector-jdbc_2.12-1.12.2.jar"
    )

    # when running in mini-cluster mode, call wait() to block until the job completes
    table_env.execute_sql(
        "insert into Results select * from stock_info").wait()
Example 26
def offset_and_fetch_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file_1 = "/tmp/table_offset_and_fetch_batch_1.csv"
    result_file_2 = "/tmp/table_offset_and_fetch_batch_2.csv"
    result_file_3 = "/tmp/table_offset_and_fetch_batch_3.csv"
    if os.path.exists(result_file_1):
        os.remove(result_file_1)
    if os.path.exists(result_file_2):
        os.remove(result_file_2)
    if os.path.exists(result_file_3):
        os.remove(result_file_3)

    bt_env.register_table_sink("result1",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file_1))

    bt_env.register_table_sink("result2",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file_2))

    bt_env.register_table_sink("result3",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file_3))

    left = bt_env.from_elements(
        [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")

    ordered_table = left.order_by("a.asc")

    ordered_table.fetch(5).insert_into("result1")
    ordered_table.offset(1).insert_into("result2")
    ordered_table.offset(1).fetch(2).insert_into("result3")

    bt_env.execute("offset and fetch batch")
Example 27
    def test_get_execution_plan(self):
        tmp_dir = tempfile.gettempdir()
        source_path = os.path.join(tmp_dir + '/streaming.csv')
        tmp_csv = os.path.join(tmp_dir + '/streaming2.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]

        t_env = BatchTableEnvironment.create(self.env)
        csv_source = CsvTableSource(source_path, field_names, field_types)
        t_env.register_table_source("Orders", csv_source)
        t_env.register_table_sink(
            "Results", CsvTableSink(field_names, field_types, tmp_csv))
        t_env.scan("Orders").insert_into("Results")

        plan = self.env.get_execution_plan()

        json.loads(plan)
Example 28
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
         .select(table.word, expr.lit(1).count.alias('count')) \
         .insert_into("Results")

    t_env.execute("word_count")
Example 29
def word_count():
    f1 = open("/home/mnm/flink-1.9.1/1", "r")
    f2 = open("/home/mnm/flink-1.9.1/2", "r")
    f3 = open("/home/mnm/flink-1.9.1/3", "r")
    f4 = open("/home/mnm/flink-1.9.1/4", "r")
    f5 = open("/home/mnm/flink-1.9.1/5", "r")
    content = f1.read() + f2.read() + f3.read() + f4.read() + f5.read()

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
         .group_by("word") \
         .select("word, count(1) as count") \
         .insert_into("Results")

    t_env.execute("Python batch word count")
Example 30
def word_count():
    # declare a table environment, set configurations.
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)

    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    output_file = os.path.abspath('.') + '/out.txt'
    if os.path.exists(output_file):
        try:
            if os.path.isfile(output_file):
                os.remove(output_file)
        except OSError as e:
            print("Error", e.filename, e.strerror)
    print("Results:", output_file)

    sink_ddl = """
            create table Results(
                word VARCHAR,
                `count` BIGINT
            ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
            )
        """.format(output_file)
    t_env.sql_update(sink_ddl)

    # create the source table with a single string
    # performs some transformations and writes the results to the Results table
    content = "Who's there? I think I hear them. Stand, ho! Who's there?"
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    # execute the Flink Python Table API job
    t_env.execute("word_count")