Example #1
def table_func_python_sql_join_lateral_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)

    source_table = bt_env.from_elements([("a aa aaa", "aa"),
                                         ("b bb bbb", "bb"),
                                         ("c cc ccc", "cc")],
                                        ["a", "b"]).select("a, b")

    result_file = "/tmp/table_func_python_sql_join_lateral_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c"],
                     [DataTypes.STRING(),
                      DataTypes.STRING(),
                      DataTypes.INT()], result_file))

    bt_env.register_java_function("split", "com.pyflink.table.Split")
    bt_env.register_table("MyTable", source_table)

    result = bt_env.sql_query(
        "SELECT a, word, length FROM MyTable, LATERAL TABLE(split(a)) as T(word, length)"
    )

    result.insert_into("result")

    bt_env.execute("table func python sql join lateral api")
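Note: like the page these snippets were collected from, the examples omit their import statements. For the snippets below, a preamble roughly like the following is assumed (module paths follow the pre-1.14 PyFlink releases that still ship BatchTableEnvironment, CsvTableSink and the connect()/descriptor API; the Kafka/JSON examples additionally use classes from pyflink.table.descriptors; adjust to your Flink version):

import os

from pyflink.dataset import ExecutionEnvironment
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (BatchTableEnvironment, StreamTableEnvironment,
                           DataTypes, EnvironmentSettings)
from pyflink.table.sinks import CsvTableSink
from pyflink.table.sources import CsvTableSource
from pyflink.table.window import Tumble, Slide, Session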
Example #2
def minus_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = "/tmp/table_minus_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements([(1, "ra", "raa"), (2, "lb", "lbb"),
                                 (3, "", "lcc"), (2, "lb", "lbb"),
                                 (1, "ra", "raa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"),
                                  (3, "rc", "rcc"), (1, "ra", "raa")],
                                 ["a", "b", "c"]).select("a, b, c")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(),
             DataTypes.STRING(),
             DataTypes.STRING()], result_file))

    result = left.minus(right)
    result.insert_into("result")
    bt_env.execute("minus batch")
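minus() behaves like SQL EXCEPT and removes duplicate rows from the result. If duplicates should be kept, the Table API also offers minus_all(); a one-line variant of the statement above:

result = left.minus_all(right)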
Example #3
def select_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)

    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "c"],
                     [DataTypes.STRING(), DataTypes.INT()], result_file))
    orders = st_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    st_env.execute("select streaming")
Example #4
def input_output_table():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    statement_set = table_env.create_statement_set()
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/input_output.py"
    prop = {}
    func = "map_func"
    env_path = None
    prop[MLCONSTANTS.ENCODING_CLASS] = "org.flinkextended.flink.ml.operator.coding.RowCSVCoding"
    prop[MLCONSTANTS.DECODING_CLASS] = "org.flinkextended.flink.ml.operator.coding.RowCSVCoding"
    inputSb = "INT_32" + "," + "INT_64" + "," + "FLOAT_32" + "," + "FLOAT_64" + "," + "STRING"
    prop["sys:csv_encode_types"] = inputSb
    prop["sys:csv_decode_types"] = inputSb
    prop[MLCONSTANTS.PYTHON_VERSION] = "3.7"
    source_file = os.getcwd() + "/../../src/test/resources/input.csv"
    sink_file = os.getcwd() + "/../../src/test/resources/output.csv"
    table_source = CsvTableSource(source_file, ["a", "b", "c", "d", "e"], [
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING()
    ])
    table_env.register_table_source("source", table_source)
    input_tb = table_env.from_path("source")
    output_schema = TableSchema(["a", "b", "c", "d", "e"], [
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING()
    ])
    sink = CsvTableSink(["a", "b", "c", "d", "e"], [
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING()
    ], sink_file, write_mode=WriteMode.OVERWRITE)
    table_env.register_table_sink("table_row_sink", sink)
    tf_config = TFConfig(work_num, ps_num, prop, python_file, func, env_path)
    output_table = train(stream_env, table_env, statement_set, input_tb,
                         tf_config, output_schema)

    # output_table = inference(stream_env, table_env, statement_set, input_tb, tf_config, output_schema)

    statement_set.add_insert("table_row_sink", output_table)
    job_client = statement_set.execute().get_job_client()
    if job_client is not None:
        job_client.get_job_execution_result(user_class_loader=None).result()
Example #5
    def test_table_environment_with_blink_planner(self):
        self.env.set_parallelism(1)
        t_env = StreamTableEnvironment.create(
            self.env,
            environment_settings=EnvironmentSettings.new_instance(
            ).use_blink_planner().build())

        source_path = os.path.join(self.tempdir + '/streaming.csv')
        sink_path = os.path.join(self.tempdir + '/result.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)

        t_env.register_table_source("source", csv_source)

        t_env.register_table_sink(
            "sink", CsvTableSink(field_names, field_types, sink_path))
        source = t_env.scan("source")

        result = source.alias("a, b, c").select("1 + a, b, c")

        result.insert_into("sink")

        t_env.execute("blink_test")

        results = []
        with open(sink_path, 'r') as f:
            results.append(f.readline())
            results.append(f.readline())

        self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
Example #6
def group_by_window_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_window_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "start", "end", "rowtime", "d"], [
            DataTypes.STRING(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.INT()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("a, w") \
        .select("a, w.start, w.end, w.rowtime, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg batch")
Example #7
    def test_table_environment_with_blink_planner(self):
        t_env = BatchTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance(
            ).in_batch_mode().use_blink_planner().build())

        source_path = os.path.join(self.tempdir + '/streaming.csv')
        sink_path = os.path.join(self.tempdir + '/results')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)

        t_env.register_table_source("source", csv_source)

        t_env.register_table_sink(
            "sink", CsvTableSink(field_names, field_types, sink_path))
        source = t_env.scan("source")

        result = source.alias("a, b, c").select("1 + a, b, c")

        result.insert_into("sink")

        t_env.execute("blink_test")

        results = []
        for root, dirs, files in os.walk(sink_path):
            for sub_file in files:
                with open(os.path.join(root, sub_file), 'r') as f:
                    line = f.readline()
                    while line is not None and line != '':
                        results.append(line)
                        line = f.readline()

        self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
Example #8
def filter_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_filter_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ], result_file))
    orders = st_env.scan("Orders")
    result = orders.filter("b % 2 === 0")
    result.insert_into("result")
    st_env.execute("filter streaming")
Example #9
def union():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = os.getcwd() + "/tmp/table_union_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements([(1, "1b", "1bb"), (2, "2a", "2aa"),
                                 (3, None, "3aa"), (1, "1a", "1laa"),
                                 (1, "1b", "1bb")],
                                ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"),
                                  (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["a", "b", "c"]).select("a, b, c")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(),
             DataTypes.STRING(),
             DataTypes.STRING()], result_file))

    result = left.union(right)
    #result = left.union_all(right)
    result.insert_into("result")
    bt_env.execute("union")

    with open(result_file, 'r') as f:
        print(f.read())
Example #10
def select():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select.csv"
    if os.path.exists(result_file):
        os.remove(result_file)

    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "c"],
                     [DataTypes.STRING(), DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    bt_env.execute("select")

    with open(result_file, 'r') as f:
        print(f.read())
Example #11
    def test_execute(self):
        tmp_dir = tempfile.gettempdir()
        field_names = ['a', 'b', 'c']
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.STRING()
        ]
        t_env = StreamTableEnvironment.create(self.env)
        t_env.register_table_sink(
            'Results',
            CsvTableSink(
                field_names, field_types,
                os.path.join('{}/{}.csv'.format(tmp_dir, round(time.time())))))
        t_env.insert_into(
            'Results',
            t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c']))
        execution_result = t_env.execute('test_stream_execute')
        self.assertIsNotNone(execution_result.get_job_id())
        self.assertTrue(execution_result.is_job_execution_result())
        self.assertIsNotNone(
            execution_result.get_job_execution_result().get_job_id())
        self.assertIsNotNone(execution_result.get_net_runtime())
        self.assertEqual(len(execution_result.get_all_accumulator_results()), 0)
        self.assertIsNone(
            execution_result.get_accumulator_result('accumulator'))
        self.assertIsNotNone(execution_result.to_string())
Example #12
def add_columns_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_add_columns_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime", "d"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP(),
            DataTypes.STRING()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.add_columns("concat(a, '_sunny') as d")
    result.insert_into("result")
    bt_env.execute("add columns batch")
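Besides add_columns(), the Table API provides add_or_replace_columns(), rename_columns() and drop_columns(). A sketch that overwrites an existing column instead of appending a new one:

result = orders.add_or_replace_columns("concat(a, '_sunny') as a")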
Example #13
def left_outer_join_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file = "/tmp/table_left_outer_join_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = bt_env.from_elements([(1, "1a", "1laa"), (2, "2a", "2aa"),
                                 (3, None, "3aa"), (2, "4b", "4bb"),
                                 (5, "5a", "5aa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"),
                                  (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["d", "e", "f"]).select("d, e, f")
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(),
             DataTypes.STRING(),
             DataTypes.STRING()], result_file))

    result = left.left_outer_join(right, "a = d").select("a, b, e")
    result.insert_into("result")
    bt_env.execute("left outer join batch")
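right_outer_join() and full_outer_join() take the same arguments; for instance, a full outer join over the same two tables would look roughly like:

result = left.full_outer_join(right, "a = d").select("a, b, e")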
Example #14
def inner_join_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/table_inner_join_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = st_env.from_elements([(1, "1a", "1laa"), (2, "2a", "2aa"),
                                 (3, None, "3aa"), (2, "4b", "4bb"),
                                 (5, "5a", "5aa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"),
                                  (1, "3b", "3bb"), (4, "4b", "4bb")],
                                 ["d", "e", "f"]).select("d, e, f")
    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(),
             DataTypes.STRING(),
             DataTypes.STRING()], result_file))

    result = left.join(right).where("a = d").select("a, b, e")
    result.insert_into("result")
    st_env.execute("inner join streaming")
Example #15
def where_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = os.getcwd() + "/../result/table_where_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.where("a === 'b'")
    result.insert_into("result")
    bt_env.execute("where batch")
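where() and filter() are interchangeable on a Table, so the predicate above could equally be written as:

result = orders.filter("a === 'b'")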
Example #16
def offset_and_fetch_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    result_file_1 = "/tmp/table_offset_and_fetch_batch_1.csv"
    result_file_2 = "/tmp/table_offset_and_fetch_batch_2.csv"
    result_file_3 = "/tmp/table_offset_and_fetch_batch_3.csv"
    if os.path.exists(result_file_1):
        os.remove(result_file_1)
    if os.path.exists(result_file_2):
        os.remove(result_file_2)
    if os.path.exists(result_file_3):
        os.remove(result_file_3)

    bt_env.register_table_sink("result1",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file_1))

    bt_env.register_table_sink("result2",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file_2))

    bt_env.register_table_sink("result3",
                               CsvTableSink(["a", "b", "c"],
                                            [DataTypes.BIGINT(),
                                             DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file_3))

    left = bt_env.from_elements(
        [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")],
        ["a", "b", "c"]).select("a, b, c")

    ordered_table = left.order_by("a.asc")

    ordered_table.fetch(5).insert_into("result1")
    ordered_table.offset(1).insert_into("result2")
    ordered_table.offset(1).fetch(2).insert_into("result3")

    bt_env.execute("offset and fetch batch")
Example #17
    def test_explain_with_multi_sinks(self):
        t_env = self.t_env
        source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")], ["a", "b", "c"])
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
        t_env.register_table_sink(
            "sink1",
            CsvTableSink(field_names, field_types, "path1"))
        t_env.register_table_sink(
            "sink2",
            CsvTableSink(field_names, field_types, "path2"))

        t_env.sql_update("insert into sink1 select * from %s where a > 100" % source)
        t_env.sql_update("insert into sink2 select * from %s where a < 100" % source)

        actual = t_env.explain(extended=True)
        self.assertIsInstance(actual, str)
Example #18
    def test_explain_with_multi_sinks(self):
        t_env = self.t_env
        source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")], ["a", "b", "c"])
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
        t_env.register_table_sink(
            "sink1",
            CsvTableSink(field_names, field_types, "path1"))
        t_env.register_table_sink(
            "sink2",
            CsvTableSink(field_names, field_types, "path2"))

        stmt_set = t_env.create_statement_set()
        stmt_set.add_insert_sql("insert into sink1 select * from %s where a > 100" % source)
        stmt_set.add_insert_sql("insert into sink2 select * from %s where a < 100" % source)

        actual = stmt_set.explain(ExplainDetail.ESTIMATED_COST, ExplainDetail.CHANGELOG_MODE)
        self.assertIsInstance(actual, str)
Example #19
    def test_statement_set(self):
        t_env = self.t_env
        source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")], ["a", "b", "c"])
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
        t_env.register_table_sink(
            "sink1",
            CsvTableSink(field_names, field_types, "path1"))
        t_env.register_table_sink(
            "sink2",
            CsvTableSink(field_names, field_types, "path2"))

        stmt_set = t_env.create_statement_set()

        stmt_set.add_insert_sql("insert into sink1 select * from %s where a > 100" % source) \
            .add_insert("sink2", source.filter("a < 100"))

        actual = stmt_set.explain()
        assert isinstance(actual, str)
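Beyond explain(), the statement set can be submitted as a single job. On releases where StatementSet.execute() returns a TableResult (Flink 1.11+; wait() appears on newer releases), a minimal sketch looks like:

table_result = stmt_set.execute()
table_result.wait()  # block until the batch job finishes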
Example #20
    def test_explain_with_multi_sinks_with_blink_planner(self):
        t_env = BatchTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance(
            ).in_batch_mode().use_blink_planner().build())
        source = t_env.from_elements([(1, "Hi", "Hello"),
                                      (2, "Hello", "Hello")], ["a", "b", "c"])
        field_names = ["a", "b", "c"]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.STRING()
        ]
        t_env.register_table_sink(
            "sink1", CsvTableSink(field_names, field_types, "path1"))
        t_env.register_table_sink(
            "sink2", CsvTableSink(field_names, field_types, "path2"))

        t_env.sql_update("insert into sink1 select * from %s where a > 100" %
                         source)
        t_env.sql_update("insert into sink2 select * from %s where a < 100" %
                         source)

        actual = t_env.explain(extended=True)
        self.assertIsInstance(actual, str)
Example #21
    def execute(self, function_context: FlinkFunctionContext,
                input_table: Table) -> None:
        t_env = function_context.get_table_env()
        statement_set = function_context.get_statement_set()
        dummy_output_path = function_context.get_example_meta().batch_uri
        if os.path.exists(dummy_output_path):
            if os.path.isdir(dummy_output_path):
                shutil.rmtree(dummy_output_path)
            else:
                os.remove(dummy_output_path)
        sink = CsvTableSink(
            ['a', 'b', 'c'],
            [DataTypes.STRING(),
             DataTypes.STRING(),
             DataTypes.STRING()], dummy_output_path, ';')
        t_env.register_table_sink('mySink', sink)
        statement_set.add_insert("mySink", input_table)
Example #22
    def execute(self, function_context: FlinkFunctionContext,
                input_table: Table) -> None:
        example_meta: ExampleMeta = function_context.get_example_meta()
        output_file = example_meta.batch_uri
        if os.path.exists(output_file):
            if os.path.isdir(output_file):
                shutil.rmtree(output_file)
            else:
                os.remove(output_file)
        t_env = function_context.get_table_env()
        statement_set = function_context.get_statement_set()
        sink = CsvTableSink(
            ['a', 'b'],
            [DataTypes.STRING(), DataTypes.STRING()], output_file, ';')

        t_env.register_table_sink('mySink', sink)
        statement_set.add_insert('mySink', input_table)
Example #23
    def test_get_execution_plan(self):
        tmp_dir = tempfile.gettempdir()
        source_path = os.path.join(tmp_dir + '/streaming.csv')
        tmp_csv = os.path.join(tmp_dir + '/streaming2.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]

        t_env = BatchTableEnvironment.create(self.env)
        csv_source = CsvTableSource(source_path, field_names, field_types)
        t_env.register_table_source("Orders", csv_source)
        t_env.register_table_sink(
            "Results", CsvTableSink(field_names, field_types, tmp_csv))
        t_env.scan("Orders").insert_into("Results")

        plan = self.env.get_execution_plan()

        json.loads(plan)
Example #24
def custom_test_source_demo():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/custom_test_source_demo.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False)
    st_env.connect(custom_connector) \
        .with_schema(
        Schema()
            .field("a", DataTypes.STRING())
    ).register_table_source("source")

    st_env.register_table_sink(
        "result", CsvTableSink(["a"], [DataTypes.STRING()], result_file))
    orders = st_env.scan("source")
    orders.insert_into("result")
    st_env.execute("custom test source demo")
Example #25
def aggregate_func_python_table_api():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_table = bt_env.from_elements([("a", 1, 1), ("a", 2, 2), ("b", 3, 2),
                                         ("a", 5, 2)],
                                        ["user", "points", "level"])

    result_file = "/tmp/aggregate_func_python_table_api.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.BIGINT()], result_file))
    bt_env.register_java_function("wAvg", "com.pyflink.table.WeightedAvg")
    result = source_table.group_by("user").select(
        "user, wAvg(points, level) as avgPoints")
    result.insert_into("result")
    bt_env.execute("aggregate func python table api")
Example #26
def slide_time_window_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_slide_time_window_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result", CsvTableSink(["a"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Slide.over("60.minutes").every("10.minutes").on("rowtime").alias("w")) \
        .group_by("w").select("b.sum")
    result.insert_into("result")
    bt_env.execute("slide time window batch")
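Session windows are declared the same way as tumbling and sliding windows; a sketch over the same rowtime attribute with a 30-minute gap (Session is imported from pyflink.table.window):

result = orders.window(Session.with_gap("30.minutes").on("rowtime").alias("w")) \
    .group_by("w").select("b.sum")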
Example #27
def distinct_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_distinct_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result", CsvTableSink(["b"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.group_by("a") \
        .select("b.sum.distinct as d")
    result.insert_into("result")
    bt_env.execute("distinct agg batch")
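"b.sum.distinct" sums only the distinct values of b within each group. To deduplicate whole rows instead, the Table API offers distinct():

result = orders.distinct()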
Example #28
    # Fragment: assumes s_env/st_env, custom_connector, custom_format and
    # result_file are set up as in the full custom_kafka_source_demo example below.
    st_env \
        .connect(custom_connector) \
        .with_format(
            custom_format
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("proctime", DataTypes.TIMESTAMP())
            .proctime()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
        ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file))

    st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("custom kafka source demo")
    # cat /tmp/custom_kafka_source_demo.csv
    # a,3
    # b,4
    # a,5
Example #29
def tumble_time_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/tumble_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
             )
         ) \
        .with_schema(  # declare the schema of the table
             Schema()
             .field("rowtime", DataTypes.TIMESTAMP())
             .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
             .field("a", DataTypes.STRING())
             .field("b", DataTypes.STRING())
             .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(),
                                             DataTypes.STRING()],
                                            result_file))

    st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("tumble time window streaming")
Example #30
def custom_kafka_source_demo():
    custom_connector = CustomConnectorDescriptor('kafka', 1, True) \
        .property('connector.topic', 'user') \
        .property('connector.properties.0.key', 'zookeeper.connect') \
        .property('connector.properties.0.value', 'localhost:2181') \
        .property('connector.properties.1.key', 'bootstrap.servers') \
        .property('connector.properties.1.value', 'localhost:9092') \
        .properties({'connector.version': '0.11', 'connector.startup-mode': 'earliest-offset'})

    # the key is 'format.json-schema'
    custom_format = CustomFormatDescriptor('json', 1) \
        .property('format.json-schema',
                  "{"
                  "  type: 'object',"
                  "  properties: {"
                  "    a: {"
                  "      type: 'string'"
                  "    },"
                  "    b: {"
                  "      type: 'string'"
                  "    },"
                  "    c: {"
                  "      type: 'string'"
                  "    },"
                  "    time: {"
                  "      type: 'string',"
                  "      format: 'date-time'"
                  "    }"
                  "  }"
                  "}") \
        .properties({'format.fail-on-missing-field': 'true'})

    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/custom_kafka_source_demo.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(custom_connector) \
        .with_format(
            custom_format
        ) \
        .with_schema(  # declare the schema of the table
            Schema()
            .field("proctime", DataTypes.TIMESTAMP())
            .proctime()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
            .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.STRING()], result_file))

    st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("custom kafka source demo")
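For reference, a Kafka message matching the JSON schema declared in custom_format would look roughly like the record below (field values are purely illustrative):

{"a": "a", "b": "1", "c": "x", "time": "2013-01-01T00:14:13Z"}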