Exemple #1
0
def alias_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_alias_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ], result_file))
    orders = st_env.scan("Orders")
    result = orders.alias("x, y, z, t").select("x, y, z, t")
    result.insert_into("result")
    st_env.execute("alias streaming")
def group_by_agg():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_agg.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source("Orders",
                                 CsvTableSource(source_file,
                                                ["a", "b", "c", "rowtime"],
                                                [DataTypes.STRING(),
                                                 DataTypes.INT(),
                                                 DataTypes.INT(),
                                                 DataTypes.TIMESTAMP()]))
    bt_env.register_table_sink("result",
                               CsvTableSink(["a", "b"],
                                            [DataTypes.STRING(),
                                             DataTypes.INT()],
                                            result_file))
    orders = bt_env.scan("Orders")
    result = orders.group_by("a").select("a, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg")

    with open(result_file, 'r') as f:
        print(f.read())
def group_by_window_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_group_by_window_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "start", "end", "rowtime", "d"], [
            DataTypes.STRING(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.TIMESTAMP(),
            DataTypes.INT()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Tumble.over("1.hours").on("rowtime").alias("w")) \
        .group_by("a, w") \
        .select("a, w.start, w.end, w.rowtime, b.sum as d")
    result.insert_into("result")
    bt_env.execute("group by agg batch")
def select_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)

    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "c"],
                     [DataTypes.STRING(), DataTypes.INT()], result_file))
    orders = st_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    st_env.execute("select streaming")
Exemple #5
0
def where_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = os.getcwd() + "/../result/table_where_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.where("a === 'b'")
    result.insert_into("result")
    bt_env.execute("where batch")
Exemple #6
0
def add_columns_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_add_columns_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime", "d"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP(),
            DataTypes.STRING()
        ], result_file))
    orders = bt_env.scan("Orders")
    result = orders.add_columns("concat(a, '_sunny') as d")
    result.insert_into("result")
    bt_env.execute("add columns batch")
Exemple #7
0
    def test_select(self):
        source_path = os.path.join(self.tempdir + '/streaming.csv')
        with open(source_path, 'w') as f:
            lines = '1,hi,hello\n' + '2,hi,hello\n'
            f.write(lines)
            f.close()
        field_names = ["a", "b", "c"]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.STRING()
        ]
        t_env = self.t_env
        # register Orders table in table environment
        t_env.register_table_source(
            "Orders", CsvTableSource(source_path, field_names, field_types))
        t_env.register_table_sink("Results", field_names, field_types,
                                  source_sink_utils.TestAppendSink())

        t_env.scan("Orders") \
             .select("a + 1, b, c") \
             .insert_into("Results")
        t_env.execute()
        actual = source_sink_utils.results()

        expected = ['2,hi,hello', '3,hi,hello']
        self.assert_equals(actual, expected)
def group_by_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("group_by_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
        ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
           Json()
           .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")

    orders = st_env.scan("Orders")
    groub_by_table = orders.group_by("a").select("a, b.sum as d")
    # Because the schema of index user in elasticsearch is
    # {"a":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
    # "b":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}
    # so we need to cast the type in our demo.
    st_env.register_table("group_table", groub_by_table)
    result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table")
    result.insert_into("result")
    st_env.execute("group by agg streaming")
    def test_get_execution_plan(self):
        tmp_dir = tempfile.gettempdir()
        source_path = os.path.join(tmp_dir + '/streaming.csv')
        tmp_csv = os.path.join(tmp_dir + '/streaming2.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]

        t_env = BatchTableEnvironment.create(self.env)
        csv_source = CsvTableSource(source_path, field_names, field_types)
        t_env.register_table_source("Orders", csv_source)
        t_env.register_table_sink(
            "Results", CsvTableSink(field_names, field_types, tmp_csv))
        t_env.scan("Orders").insert_into("Results")

        plan = self.env.get_execution_plan()

        json.loads(plan)
Exemple #10
0
def distinct_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))

    orders = st_env.scan("Orders")
    result = orders.select("a, b").distinct()
    # use custom retract sink connector
    sink = TestRetractSink(["a", "b"], [DataTypes.STRING(), DataTypes.INT()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("distinct streaming")
Exemple #11
0
def slide_time_window_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_slide_time_window_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result", CsvTableSink(["a"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Slide.over("60.minutes").every("10.minutes").on("rowtime").alias("w")) \
        .group_by("w").select("b.sum")
    result.insert_into("result")
    bt_env.execute("slide time window batch")
Exemple #12
0
def distinct_agg_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_distinct_agg_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result", CsvTableSink(["b"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.group_by("a") \
        .select("b.sum.distinct as d")
    result.insert_into("result")
    bt_env.execute("distinct agg batch")