Example 1
def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # for a streaming job, use StreamExecutionEnvironment / StreamTableEnvironment instead

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    # note: the sink raises an error if the output file already exists
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
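
This snippet assumes imports that are not shown; judging from the near-identical Example 22 below, they are (ExecutionEnvironment lives in pyflink.dataset in PyFlink 1.x):

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.expressions import lit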
Example 2
def demo02():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # for a streaming job, use StreamExecutionEnvironment / StreamTableEnvironment instead

    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/input'
        )
    """

    my_sink_ddl = """
        create table mySink (
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output'
        )
    """

    t_env.execute_sql(my_source_ddl)
    t_env.execute_sql(my_sink_ddl)

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
Example 3
    def test_tumble_group_window_aggregate_function(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))
        sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b TIMESTAMP(3), c FLOAT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        tumble_window = Tumble.over(lit(1).hours) \
            .on(col("rowtime")) \
            .alias("w")
        t.window(tumble_window) \
            .group_by(col("w")) \
            .select(col("w").start, col("w").end, mean_udaf(t.b)) \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[2018-03-11T03:00, 2018-03-11T04:00, 2.2]",
                            "+I[2018-03-11T04:00, 2018-03-11T05:00, 8.0]"])
Example 4
    def test_tumbling_group_window_over_time(self):
        # create source file path
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:30:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
        ]
        source_path = tmp_dir + '/test_tumbling_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.create_temporary_system_function(
            "my_count", CountDistinctAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c INT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d BIGINT, e BIGINT)
        WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        t.window(Tumble.over(lit(1).hours).on(t.rowtime).alias("w")) \
            .group_by(t.a, col("w")) \
            .select(t.a,
                    col("w").start,
                    col("w").end,
                    t.c.count.alias("c"),
                    call("my_count", t.c).alias("d")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "+I[2, 2018-03-11T03:00, 2018-03-11T04:00, 2, 1]",
            "+I[3, 2018-03-11T03:00, 2018-03-11T04:00, 1, 1]",
            "+I[1, 2018-03-11T03:00, 2018-03-11T04:00, 2, 2]",
            "+I[1, 2018-03-11T04:00, 2018-03-11T05:00, 1, 1]"
        ])
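
CountDistinctAggregateFunction is another helper from the test module (Example 6's CountAggregateFunction is the plain-count analogue). A simplified, hypothetical stand-in that matches the distinct counts in the expected rows; the real helper likely keeps its state in a MapView-backed accumulator:

from pyflink.table import DataTypes
from pyflink.table.udf import AggregateFunction

class CountDistinctAggregateFunction(AggregateFunction):
    # simplified sketch: count occurrences per value, return the number of keys
    def create_accumulator(self):
        return {}

    def accumulate(self, accumulator, *args):
        v = args[0]
        accumulator[v] = accumulator.get(v, 0) + 1

    def get_value(self, accumulator):
        return len(accumulator)

    def get_result_type(self):
        return DataTypes.BIGINT()

    def get_accumulator_type(self):
        # assumption: the real accumulator type may differ
        return DataTypes.MAP(DataTypes.INT(), DataTypes.BIGINT())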
Example 5
    def test_session_window(self):
        t = self.t_env.from_elements([(1000, 1, "Hello")], ["a", "b", "c"])
        result = t.window(Session.with_gap(expr.lit(1).seconds).on("a").alias("w"))\
            .group_by(expr.col('w'), expr.col('c')).select(t.b.sum)

        query_operation = result._j_table.getQueryOperation().getChildren().get(0)
        self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
        self.assertEqual('SessionWindow(field: [a], gap: [1000])',
                         query_operation.getGroupWindow().asSummaryString())
Example 6
    def test_session_group_window_over_time(self):
        # create source file path
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_session_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.register_function("my_count", CountAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd'],
            [
                DataTypes.TINYINT(),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t.window(Session.with_gap(lit(30).minutes).on(t.rowtime).alias("w")) \
            .group_by(t.a, t.b, col("w")) \
            .select(t.a, col("w").start, col("w").end, call("my_count", t.c).alias("c")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[3, 2018-03-11 03:10:00.0, 2018-03-11 03:40:00.0, 1]",
                            "+I[2, 2018-03-11 03:10:00.0, 2018-03-11 04:00:00.0, 2]",
                            "+I[1, 2018-03-11 03:10:00.0, 2018-03-11 04:10:00.0, 2]",
                            "+I[1, 2018-03-11 04:20:00.0, 2018-03-11 04:50:00.0, 1]"])
Example 7
    def test_chaining_scalar_function(self):
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)],
                                     ['a', 'b', 'c'])
        t = t.select(add(add_one(t.a), subtract_one(t.b)), t.c, expr.lit(1))

        result = self.collect(t)
        self.assertEqual(result, ["3,1,1", "7,2,1", "4,3,1"])
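
add_one is defined inline, but SubtractOne and add come from the test module (they reappear in Examples 12 and 15); sketches consistent with the expected results, with the bodies being assumptions:

from pyflink.table import DataTypes
from pyflink.table.udf import ScalarFunction, udf

class SubtractOne(ScalarFunction):
    def eval(self, i):
        return i - 1

add = udf(lambda i, j: i + j, result_type=DataTypes.BIGINT())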
Example 8
def demo01():
    # environment configuration
    t_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build())

    # register Orders table and Result table sink in table environment
    source_data_path = "/path/to/source/directory/"
    result_data_path = "/path/to/result/directory/"
    source_ddl = f"""
            create table Orders(
                a VARCHAR,
                b BIGINT,
                c BIGINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND
            ) with (
                'connector' = 'filesystem',
                'format' = 'csv',
                'path' = '{source_data_path}'
            )
            """
    t_env.execute_sql(source_ddl)

    sink_ddl = f"""
        create table `Result`(
            a VARCHAR,
            cnt BIGINT
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '{result_data_path}'
        )
        """
    t_env.execute_sql(sink_ddl)

    # specify table program
    orders = t_env.from_path("Orders")  # schema (a, b, c, rowtime)

    orders.group_by("a").select(orders.a, orders.b.count.alias('cnt')).execute_insert("result").wait()

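    # each call below returns a new Table; the intermediate results are illustrative and unused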
    orders.where(orders.a == 'red')
    orders.filter(orders.b % 2 == 0)
    orders.add_columns(concat(orders.c, 'sunny'))
    orders.add_or_replace_columns(concat(orders.c, 'sunny').alias('desc'))
    orders.drop_columns(orders.b, orders.c)
    orders.rename_columns(orders.b.alias('b2'), orders.c.alias('c2'))
    orders.group_by(orders.a).select(orders.a, orders.b.sum.alias('d'))

    # tab.group_by(tab.key).select(tab.key, tab.value.avg.alias('average'))
    # tab.group_by("key").select("key, value.avg as average")
    result = orders.filter(orders.a.is_not_null & orders.b.is_not_null & orders.c.is_not_null) \
        .select(orders.a.lower_case.alias('a'), orders.b, orders.rowtime) \
        .window(Tumble.over(lit(1).hour).on(orders.rowtime).alias("hourly_window")) \
        .group_by(col('hourly_window'), col('a')) \
        .select(col('a'), col('hourly_window').end.alias('hour'), col('b').avg.alias('avg_billing_amount'))
    """
Example 9
    def test_expressions(self):
        expr1 = col('a')
        expr2 = col('b')
        expr3 = col('c')

        self.assertEqual('10', str(lit(10, DataTypes.INT(False))))
        self.assertEqual('rangeTo(1, 2)', str(range_(1, 2)))
        self.assertEqual('and(a, b, c)', str(and_(expr1, expr2, expr3)))
        self.assertEqual('or(a, b, c)', str(or_(expr1, expr2, expr3)))

        from pyflink.table.expressions import UNBOUNDED_ROW, UNBOUNDED_RANGE, CURRENT_ROW, \
            CURRENT_RANGE
        self.assertEqual('unboundedRow()', str(UNBOUNDED_ROW))
        self.assertEqual('unboundedRange()', str(UNBOUNDED_RANGE))
        self.assertEqual('currentRow()', str(CURRENT_ROW))
        self.assertEqual('currentRange()', str(CURRENT_RANGE))

        self.assertEqual('currentDate()', str(current_date()))
        self.assertEqual('currentTime()', str(current_time()))
        self.assertEqual('currentTimestamp()', str(current_timestamp()))
        self.assertEqual('localTime()', str(local_time()))
        self.assertEqual('localTimestamp()', str(local_timestamp()))
        self.assertEqual('toTimestampLtz(123, 0)', str(to_timestamp_ltz(123, 0)))
        self.assertEqual("temporalOverlaps(cast('2:55:00', TIME(0)), 3600000, "
                         "cast('3:30:00', TIME(0)), 7200000)",
                         str(temporal_overlaps(
                             lit("2:55:00").to_time,
                             lit(1).hours,
                             lit("3:30:00").to_time,
                             lit(2).hours)))
        self.assertEqual("dateFormat(time, '%Y, %d %M')",
                         str(date_format(col("time"), "%Y, %d %M")))
        self.assertEqual("timestampDiff(DAY, cast('2016-06-15', DATE), cast('2016-06-18', DATE))",
                         str(timestamp_diff(
                             TimePointUnit.DAY,
                             lit("2016-06-15").to_date,
                             lit("2016-06-18").to_date)))
        self.assertEqual('array(1, 2, 3)', str(array(1, 2, 3)))
        self.assertEqual("row('key1', 1)", str(row("key1", 1)))
        self.assertEqual("map('key1', 1, 'key2', 2, 'key3', 3)",
                         str(map_("key1", 1, "key2", 2, "key3", 3)))
        self.assertEqual('4', str(row_interval(4)))
        self.assertEqual('pi()', str(pi()))
        self.assertEqual('e()', str(e()))
        self.assertEqual('rand(4)', str(rand(4)))
        self.assertEqual('randInteger(4)', str(rand_integer(4)))
        self.assertEqual('atan2(1, 2)', str(atan2(1, 2)))
        self.assertEqual('minusPrefix(a)', str(negative(expr1)))
        self.assertEqual('concat(a, b, c)', str(concat(expr1, expr2, expr3)))
        self.assertEqual("concat_ws(', ', b, c)", str(concat_ws(', ', expr2, expr3)))
        self.assertEqual('uuid()', str(uuid()))
        self.assertEqual('null', str(null_of(DataTypes.BIGINT())))
        self.assertEqual('log(a)', str(log(expr1)))
        self.assertEqual('ifThenElse(a, b, c)', str(if_then_else(expr1, expr2, expr3)))
        self.assertEqual('withColumns(a, b, c)', str(with_columns(expr1, expr2, expr3)))
        self.assertEqual('a.b.c(a)', str(call('a.b.c', expr1)))
Example 10
def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts, name, price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
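
The imports this example appears to assume, plus a main guard for a local run (module paths as of PyFlink 1.14+, where Schema and TableDescriptor live in pyflink.table):

from pyflink.common import Types
from pyflink.common.time import Instant
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (DataTypes, Schema, StreamTableEnvironment,
                           TableDescriptor)
from pyflink.table.expressions import col, lit
from pyflink.table.udf import udaf
from pyflink.table.window import Tumble

if __name__ == '__main__':
    pandas_udaf()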
Example 11
def word_count(input_path, output_path):
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    # write all the data to one file
    t_env.get_config().get_configuration().set_string("parallelism.default",
                                                      "1")

    # define the source
    if input_path is not None:
        t_env.create_temporary_table(
            'source',
            TableDescriptor.for_connector('filesystem').schema(
                Schema.new_builder().column(
                    'word', DataTypes.STRING()).build()).option(
                        'path', input_path).format('csv').build())
        tab = t_env.from_path('source')
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        tab = t_env.from_elements(
            map(lambda i: (i, ), word_count_data),
            DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())]))

    # define the sink
    if output_path is not None:
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('filesystem').schema(
                Schema.new_builder().column('word', DataTypes.STRING()).column(
                    'count',
                    DataTypes.BIGINT()).build()).option('path', output_path).
            format(FormatDescriptor.for_format('canal-json').build()).build())
    else:
        print(
            "Printing result to stdout. Use --output to specify output path.")
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('print').schema(
                Schema.new_builder().column('word', DataTypes.STRING()).column(
                    'count', DataTypes.BIGINT()).build()).build())

    @udtf(result_types=[DataTypes.STRING()])
    def split(line: Row):
        for s in line[0].split():
            yield Row(s)

    # compute word count
    tab.flat_map(split).alias('word') \
       .group_by(col('word')) \
       .select(col('word'), lit(1).count) \
       .execute_insert('sink') \
       .wait()
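
word_count_data is a module-level list of input lines (the official example ships a longer text); a minimal stand-in for running the snippet without an input file:

word_count_data = ["To be, or not to be, that is the question",
                   "Whether 'tis nobler in the mind to suffer"]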
Example 12
    def test_chaining_scalar_function(self):
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
        t.select(add(add_one(t.a), subtract_one(t.b)), t.c, expr.lit(1)) \
            .execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[3, 1, 1]", "+I[7, 2, 1]", "+I[4, 3, 1]"])
Example 13
    def test_left_outer_join_lateral_with_join_predicate(self):
        t_env = self.t_env
        t_env.create_java_temporary_system_function("split",
                                                    "org.apache.flink.table.utils.TableFunc1")
        source = t_env.from_elements([("1", "1#3#5#7"), ("2", "2#4#6#8")], ["id", "words"])

        # only support "true" as the join predicate currently
        result = source.left_outer_join_lateral(expr.call('split', source.words).alias('word'),
                                                expr.lit(True))

        query_operation = result._j_table.getQueryOperation()
        self.assertEqual('LEFT_OUTER', query_operation.getJoinType().toString())
        self.assertTrue(query_operation.isCorrelated())
        self.assertEqual('true', query_operation.getCondition().toString())
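
org.apache.flink.table.utils.TableFunc1 is a Java table function from Flink's test utilities that splits its input on '#'; a hedged Python equivalent for readers without the Java test jar:

from pyflink.table import DataTypes
from pyflink.table.udf import udtf

@udtf(result_types=[DataTypes.STRING()])
def split(words: str):
    # same observable behavior: "1#3#5#7" -> "1", "3", "5", "7"
    for w in words.split('#'):
        yield w

# registered with: t_env.create_temporary_system_function("split", split)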
Example 14
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    # t_config.set_python_executable("/opt/python38/bin/python3")
    # or add `python.client.executable: /usr/bin/python3` to conf/flink-conf.yaml
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])


    table.group_by(table.word) \
        .select(table.word, expr.lit(1).count.alias('count')) \
        .execute_insert("Results").wait()
Example 15
    def test_chaining_scalar_function(self):
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        sink_table_ddl = """
                CREATE TABLE Results(a BIGINT, b BIGINT, c INT) WITH ('connector'='test-sink')
                """
        self.t_env.execute_sql(sink_table_ddl)

        t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)],
                                     ['a', 'b', 'c'])
        t.select(add(add_one(t.a), subtract_one(t.b)), t.c, expr.lit(1)) \
            .execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[3, 1, 1]", "+I[7, 2, 1]", "+I[4, 3, 1]"])
Example 16
    def test_window_aggregate_with_pandas_udaf(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT(),
                DataTypes.INT()
            ])
        self.t_env.register_table_sink("Results", table_sink)
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        tumble_window = Tumble.over(expr.lit(1).hours) \
            .on(expr.col("rowtime")) \
            .alias("w")
        t.select(t.b, t.rowtime) \
            .window(tumble_window) \
            .group_by("w") \
            .aggregate(pandas_udaf.alias("d", "e")) \
            .select("w.rowtime, d, e") \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["2018-03-11 03:59:59.999,2.2,3",
                            "2018-03-11 04:59:59.999,8.0,8"])
Example 17
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
         .select(table.word, expr.lit(1).count.alias('count')) \
         .insert_into("Results")

    t_env.execute("word_count")
Example 18
def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///app/src/kafka-clients-2.8.0.jar")
    env.add_jars("file:///app/src/flink-connector-kafka_2.12-1.12.3.jar")
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    config = env.get_checkpoint_config()
    config.enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION)

    st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())

    print("register kafka source")
    register_kafka_source(st_env)
    print("register transaction sinks")
    register_transactions_sink_into_csv(st_env)


    st_env.from_path("source_tbl") \
       .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
       .group_by(col("w")) \
       .select("""count(message) as total,
                   w.end as end_time
                  """) \
       .insert_into("total_sink")

    st_env.from_path("source_tbl") \
       .where("message = 'dolorem'") \
       .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
       .group_by(col("w")) \
       .select("""
                   count(message) as total,
                   w.end as end_time
                  """) \
       .insert_into("grep_sink")

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w"), col("message")) \
        .select("""
                    count(message) as total,
                    message,
                    w.end as end_time
                   """) \
        .insert_into("topk_sink")

    st_env.execute("app")
Example 19
    def test_window_aggregate_with_pandas_udaf(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b FLOAT, c INT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        print(t.get_schema())
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        tumble_window = Tumble.over(expr.lit(1).hours) \
            .on(expr.col("rowtime")) \
            .alias("w")
        t.select(t.b, t.rowtime) \
            .window(tumble_window) \
            .group_by(expr.col("w")) \
            .aggregate(pandas_udaf.alias("d", "e")) \
            .select(expr.col("w").rowtime, expr.col("d"), expr.col("e")) \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[2018-03-11 03:59:59.999, 2.2, 3]",
                            "+I[2018-03-11 04:59:59.999, 8.0, 8]"])
Example 20
    def test_tumble_group_window_aggregate_function(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT()
            ])
        self.t_env.register_table_sink("Results", table_sink)
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        tumble_window = Tumble.over(lit(1).hours) \
            .on(col("rowtime")) \
            .alias("w")
        t.window(tumble_window) \
            .group_by(col("w")) \
            .select(col("w").start, col("w").end, mean_udaf(t.b)) \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.2]",
                            "+I[2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0]"])
Example 21
    def test_tumbling_group_window_over_time(self):
        # create source file path
        import tempfile
        import os
        tmp_dir = tempfile.gettempdir()
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_tumbling_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        from pyflink.table.window import Tumble
        self.t_env.get_config().set(
            "pipeline.time-characteristic", "EventTime")
        self.t_env.register_function("mean_udaf", mean_udaf)

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd', 'e'],
            [
                DataTypes.TINYINT(),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT()])
        self.t_env.register_table_sink("Results", table_sink)
        t.window(Tumble.over(lit(1).hours).on(t.rowtime).alias("w")) \
            .group_by(t.a, t.b, col("w")) \
            .select(t.a,
                    col("w").start,
                    col("w").end,
                    col("w").rowtime,
                    mean_udaf(t.c).alias("b")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2018-03-11 03:59:59.999, 2.5]",
            "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 2018-03-11 04:59:59.999, 8.0]",
            "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2018-03-11 03:59:59.999, 2.0]",
            "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2018-03-11 03:59:59.999, 2.0]",
        ])
        os.remove(source_path)
Example 22
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.expressions import lit

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('/tmp/input')) \
    .with_format(OldCsv()
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/tmp/output')) \
    .with_format(OldCsv()
                 .field_delimiter('\t')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

tab = t_env.from_path('mySource')
tab.group_by(tab.word) \
   .select(tab.word, lit(1).count) \
   .execute_insert('mySink').wait()
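
A quick way to exercise this tutorial is to create the input file first (equivalent to the docs' shell one-liner; the sample words are placeholders):

with open('/tmp/input', 'w') as f:
    f.write('flink\npyflink\nflink\n')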
Example 23
    def test_expression(self):
        expr1 = col('a')
        expr2 = col('b')
        expr3 = col('c')
        expr4 = col('d')
        expr5 = lit(10)

        # comparison functions
        self.assertEqual('equals(a, b)', str(expr1 == expr2))
        self.assertEqual('mod(2, b)', str(2 % expr2))
        self.assertEqual('notEquals(a, b)', str(expr1 != expr2))
        self.assertEqual('lessThan(a, b)', str(expr1 < expr2))
        self.assertEqual('lessThanOrEqual(a, b)', str(expr1 <= expr2))
        self.assertEqual('greaterThan(a, b)', str(expr1 > expr2))
        self.assertEqual('greaterThanOrEqual(a, b)', str(expr1 >= expr2))

        # logic functions
        self.assertEqual('and(a, b)', str(expr1 & expr2))
        self.assertEqual('or(a, b)', str(expr1 | expr2))
        self.assertEqual('isNotTrue(a)', str(expr1.is_not_true))
        self.assertEqual('isNotTrue(a)', str(~expr1))

        # arithmetic functions
        self.assertEqual('plus(a, b)', str(expr1 + expr2))
        self.assertEqual('plus(2, b)', str(2 + expr2))
        self.assertEqual('plus(cast(b, DATE), 2)', str(expr2.to_date + 2))
        self.assertEqual('minus(a, b)', str(expr1 - expr2))
        self.assertEqual('minus(cast(b, DATE), 2)', str(expr2.to_date - 2))
        self.assertEqual('times(a, b)', str(expr1 * expr2))
        self.assertEqual('divide(a, b)', str(expr1 / expr2))
        self.assertEqual('mod(a, b)', str(expr1 % expr2))
        self.assertEqual('power(a, b)', str(expr1**expr2))
        self.assertEqual('minusPrefix(a)', str(-expr1))

        self.assertEqual('exp(a)', str(expr1.exp))
        self.assertEqual('log10(a)', str(expr1.log10))
        self.assertEqual('log2(a)', str(expr1.log2))
        self.assertEqual('ln(a)', str(expr1.ln))
        self.assertEqual('log(a)', str(expr1.log()))
        self.assertEqual('cosh(a)', str(expr1.cosh))
        self.assertEqual('sinh(a)', str(expr1.sinh))
        self.assertEqual('sin(a)', str(expr1.sin))
        self.assertEqual('cos(a)', str(expr1.cos))
        self.assertEqual('tan(a)', str(expr1.tan))
        self.assertEqual('cot(a)', str(expr1.cot))
        self.assertEqual('asin(a)', str(expr1.asin))
        self.assertEqual('acos(a)', str(expr1.acos))
        self.assertEqual('atan(a)', str(expr1.atan))
        self.assertEqual('tanh(a)', str(expr1.tanh))
        self.assertEqual('degrees(a)', str(expr1.degrees))
        self.assertEqual('radians(a)', str(expr1.radians))
        self.assertEqual('sqrt(a)', str(expr1.sqrt))
        self.assertEqual('abs(a)', str(expr1.abs))
        self.assertEqual('abs(a)', str(abs(expr1)))
        self.assertEqual('sign(a)', str(expr1.sign))
        self.assertEqual('round(a, b)', str(expr1.round(expr2)))
        self.assertEqual('between(a, b, c)', str(expr1.between(expr2, expr3)))
        self.assertEqual('notBetween(a, b, c)',
                         str(expr1.not_between(expr2, expr3)))
        self.assertEqual('ifThenElse(a, b, c)', str(expr1.then(expr2, expr3)))

        self.assertEqual('isNull(a)', str(expr1.is_null))
        self.assertEqual('isNotNull(a)', str(expr1.is_not_null))
        self.assertEqual('isTrue(a)', str(expr1.is_true))
        self.assertEqual('isFalse(a)', str(expr1.is_false))
        self.assertEqual('isNotTrue(a)', str(expr1.is_not_true))
        self.assertEqual('isNotFalse(a)', str(expr1.is_not_false))
        self.assertEqual('distinct(a)', str(expr1.distinct))
        self.assertEqual('sum(a)', str(expr1.sum))
        self.assertEqual('sum0(a)', str(expr1.sum0))
        self.assertEqual('min(a)', str(expr1.min))
        self.assertEqual('max(a)', str(expr1.max))
        self.assertEqual('count(a)', str(expr1.count))
        self.assertEqual('avg(a)', str(expr1.avg))
        self.assertEqual('first_value(a)', str(expr1.first_value))
        self.assertEqual('last_value(a)', str(expr1.last_value))
        self.assertEqual('stddevPop(a)', str(expr1.stddev_pop))
        self.assertEqual('stddevSamp(a)', str(expr1.stddev_samp))
        self.assertEqual('varPop(a)', str(expr1.var_pop))
        self.assertEqual('varSamp(a)', str(expr1.var_samp))
        self.assertEqual('collect(a)', str(expr1.collect))
        self.assertEqual("as(a, 'a', 'b', 'c')",
                         str(expr1.alias('a', 'b', 'c')))
        self.assertEqual('cast(a, INT)', str(expr1.cast(DataTypes.INT())))
        self.assertEqual('asc(a)', str(expr1.asc))
        self.assertEqual('desc(a)', str(expr1.desc))
        self.assertEqual('in(a, b, c, d)', str(expr1.in_(expr2, expr3, expr4)))
        self.assertEqual('start(a)', str(expr1.start))
        self.assertEqual('end(a)', str(expr1.end))
        self.assertEqual('bin(a)', str(expr1.bin))
        self.assertEqual('hex(a)', str(expr1.hex))
        self.assertEqual('truncate(a, 3)', str(expr1.truncate(3)))

        # string functions
        self.assertEqual('substring(a, b, 3)', str(expr1.substring(expr2, 3)))
        self.assertEqual("trim(true, false, ' ', a)",
                         str(expr1.trim_leading()))
        self.assertEqual("trim(false, true, ' ', a)",
                         str(expr1.trim_trailing()))
        self.assertEqual("trim(true, true, ' ', a)", str(expr1.trim()))
        self.assertEqual('replace(a, b, c)', str(expr1.replace(expr2, expr3)))
        self.assertEqual('charLength(a)', str(expr1.char_length))
        self.assertEqual('upper(a)', str(expr1.upper_case))
        self.assertEqual('lower(a)', str(expr1.lower_case))
        self.assertEqual('initCap(a)', str(expr1.init_cap))
        self.assertEqual("like(a, 'Jo_n%')", str(expr1.like('Jo_n%')))
        self.assertEqual("similar(a, 'A+')", str(expr1.similar('A+')))
        self.assertEqual('position(a, b)', str(expr1.position(expr2)))
        self.assertEqual('lpad(a, 4, b)', str(expr1.lpad(4, expr2)))
        self.assertEqual('rpad(a, 4, b)', str(expr1.rpad(4, expr2)))
        self.assertEqual('overlay(a, b, 6, 2)', str(expr1.overlay(expr2, 6,
                                                                  2)))
        self.assertEqual("regexpReplace(a, b, 'abc')",
                         str(expr1.regexp_replace(expr2, 'abc')))
        self.assertEqual('regexpExtract(a, b, 3)',
                         str(expr1.regexp_extract(expr2, 3)))
        self.assertEqual('fromBase64(a)', str(expr1.from_base64))
        self.assertEqual('toBase64(a)', str(expr1.to_base64))
        self.assertEqual('ltrim(a)', str(expr1.ltrim))
        self.assertEqual('rtrim(a)', str(expr1.rtrim))
        self.assertEqual('repeat(a, 3)', str(expr1.repeat(3)))
        self.assertEqual("over(a, 'w')", str(expr1.over('w')))

        # temporal functions
        self.assertEqual('cast(a, DATE)', str(expr1.to_date))
        self.assertEqual('cast(a, TIME(0))', str(expr1.to_time))
        self.assertEqual('cast(a, TIMESTAMP(3))', str(expr1.to_timestamp))
        self.assertEqual('extract(YEAR, a)',
                         str(expr1.extract(TimeIntervalUnit.YEAR)))
        self.assertEqual('floor(a, YEAR)',
                         str(expr1.floor(TimeIntervalUnit.YEAR)))
        self.assertEqual('ceil(a)', str(expr1.ceil()))

        # advanced type helper functions
        self.assertEqual("get(a, 'col')", str(expr1.get('col')))
        self.assertEqual('flatten(a)', str(expr1.flatten))
        self.assertEqual('at(a, 0)', str(expr1.at(0)))
        self.assertEqual('cardinality(a)', str(expr1.cardinality))
        self.assertEqual('element(a)', str(expr1.element))

        # time definition functions
        self.assertEqual('rowtime(a)', str(expr1.rowtime))
        self.assertEqual('proctime(a)', str(expr1.proctime))
        self.assertEqual('120', str(expr5.year))
        self.assertEqual('120', str(expr5.years))
        self.assertEqual('30', str(expr5.quarter))
        self.assertEqual('30', str(expr5.quarters))
        self.assertEqual('10', str(expr5.month))
        self.assertEqual('10', str(expr5.months))
        self.assertEqual('6048000000', str(expr5.week))
        self.assertEqual('6048000000', str(expr5.weeks))
        self.assertEqual('864000000', str(expr5.day))
        self.assertEqual('864000000', str(expr5.days))
        self.assertEqual('36000000', str(expr5.hour))
        self.assertEqual('36000000', str(expr5.hours))
        self.assertEqual('600000', str(expr5.minute))
        self.assertEqual('600000', str(expr5.minutes))
        self.assertEqual('10000', str(expr5.second))
        self.assertEqual('10000', str(expr5.seconds))
        self.assertEqual('10', str(expr5.milli))
        self.assertEqual('10', str(expr5.millis))

        # hash functions
        self.assertEqual('md5(a)', str(expr1.md5))
        self.assertEqual('sha1(a)', str(expr1.sha1))
        self.assertEqual('sha224(a)', str(expr1.sha224))
        self.assertEqual('sha256(a)', str(expr1.sha256))
        self.assertEqual('sha384(a)', str(expr1.sha384))
        self.assertEqual('sha512(a)', str(expr1.sha512))
        self.assertEqual('sha2(a, 224)', str(expr1.sha2(224)))

        # json functions
        self.assertEqual("IS_JSON('42')", str(lit('42').is_json()))
        self.assertEqual("IS_JSON('42', SCALAR)",
                         str(lit('42').is_json(JsonType.SCALAR)))

        self.assertEqual("JSON_EXISTS('{}', '$.x')",
                         str(lit('{}').json_exists('$.x')))
        self.assertEqual(
            "JSON_EXISTS('{}', '$.x', FALSE)",
            str(lit('{}').json_exists('$.x', JsonExistsOnError.FALSE)))

        self.assertEqual(
            "JSON_VALUE('{}', '$.x', STRING, NULL, null, NULL, null)",
            str(lit('{}').json_value('$.x')))
        self.assertEqual(
            "JSON_VALUE('{}', '$.x', INT, DEFAULT, 42, ERROR, null)",
            str(
                lit('{}').json_value('$.x', DataTypes.INT(),
                                     JsonValueOnEmptyOrError.DEFAULT, 42,
                                     JsonValueOnEmptyOrError.ERROR, None)))

        self.assertEqual(
            "JSON_QUERY('{}', '$.x', WITHOUT_ARRAY, NULL, EMPTY_ARRAY)",
            str(
                lit('{}').json_query('$.x', JsonQueryWrapper.WITHOUT_ARRAY,
                                     JsonQueryOnEmptyOrError.NULL,
                                     JsonQueryOnEmptyOrError.EMPTY_ARRAY)))
Example 24
def log_processing():
    env_settings = EnvironmentSettings.new_instance().in_streaming_mode(
    ).use_blink_planner().build()
    t_env = StreamTableEnvironment.create(environment_settings=env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///Users/liuhongwei/.m2/repository/org/apache/flink/flink-connector-kafka_2.11/1.12.0/flink-connector-kafka_2.11-1.12.0.jar;file:///Users/liuhongwei/.m2/repository/net/sf/json-lib/json-lib/2.3/json-lib-2.3-jdk15.jar;file:///Users/liuhongwei/.m2/repository/org/apache/kafka/kafka-clients/2.4.1/kafka-clients-2.4.1.jar"
    )

    source_ddl = """
            CREATE TABLE source_table(
                token VARCHAR,
                stime BIGINT,
                appKey VARCHAR,
                user_action_time AS PROCTIME()
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'markTopic',
              'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
              'properties.group.id' = 'test_3',
              'scan.startup.mode' = 'earliest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                token VARCHAR,
                appKey VARCHAR,
                stime TIMESTAMP(3) NOT NULL,
                nums BIGINT NOT NULL
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'markTopic1',
              'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)
    query_sql = """
        SELECT 
          token,
          appKey,
          TUMBLE_START(user_action_time, INTERVAL '5' MINUTE) as stime, 
          COUNT(token) as nums 
        FROM source_table 
        WHERE appKey = 'YSHAppAndroidIOSH5'
        GROUP BY 
          token,
          appKey,
          TUMBLE(user_action_time, INTERVAL '5' MINUTE)
    """
    # t_env.sql_query(query_sql) \
    #     .execute_insert("sink_table").wait()
    source_t = t_env.from_path("source_table")
    result = source_t.filter(source_t.appKey == "YSHAppAndroidIOSH5") \
        .window(Slide.over(lit(1).days)
                     .every(lit(1).minutes)
                     .on(source_t.user_action_time)
                     .alias("w")) \
        .group_by(source_t.token, source_t.appKey, col("w")) \
        .select(source_t.token,
                source_t.appKey,
                col("w").start.alias("stime"),
                source_t.token.count.alias("nums"))

    result.execute_insert("sink_table").wait()
Example 25
    def test_sliding_group_window_over_time(self):
        # create source file path
        import tempfile
        import os
        tmp_dir = tempfile.gettempdir()
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_sliding_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        from pyflink.table.window import Slide
        self.t_env.get_config().set(
            "pipeline.time-characteristic", "EventTime")
        self.t_env.register_function("mean_udaf", mean_udaf)

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")
        sink_table_ddl = """
            CREATE TABLE Results(a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d FLOAT)
            WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        t.window(Slide.over(lit(1).hours)
                 .every(lit(30).minutes)
                 .on(col("rowtime"))
                 .alias("w")) \
            .group_by(t.a, t.b, col("w")) \
            .select(t.a, col("w").start, col("w").end, mean_udaf(t.c).alias("b")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[1, 2018-03-11T02:30, 2018-03-11T03:30, 2.0]",
                            "+I[1, 2018-03-11T03:00, 2018-03-11T04:00, 2.5]",
                            "+I[1, 2018-03-11T03:30, 2018-03-11T04:30, 5.5]",
                            "+I[1, 2018-03-11T04:00, 2018-03-11T05:00, 8.0]",
                            "+I[2, 2018-03-11T02:30, 2018-03-11T03:30, 1.0]",
                            "+I[2, 2018-03-11T03:00, 2018-03-11T04:00, 2.0]",
                            "+I[2, 2018-03-11T03:30, 2018-03-11T04:30, 3.0]",
                            "+I[3, 2018-03-11T03:00, 2018-03-11T04:00, 2.0]",
                            "+I[3, 2018-03-11T02:30, 2018-03-11T03:30, 2.0]"])
        os.remove(source_path)
Example 26
import os
import shutil
import tempfile

# assumed setup: the original snippet references s_env / st_env without defining them
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (DataTypes, FormatDescriptor, Schema,
                           StreamTableEnvironment, TableDescriptor)
from pyflink.table.expressions import lit

s_env = StreamExecutionEnvironment.get_execution_environment()
st_env = StreamTableEnvironment.create(s_env)

sink_path = tempfile.gettempdir() + '/batch.csv'
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)
s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')],
                         ['a', 'b', 'c'])

st_env.create_temporary_table(
    "csv_sink",
    TableDescriptor.for_connector("filesystem").schema(
        Schema.new_builder().column(
            "a", DataTypes.BIGINT()).column("b", DataTypes.STRING()).column(
                "c",
                DataTypes.STRING()).build()).option("path", sink_path).format(
                    FormatDescriptor.for_format("csv").option(
                        "field-delimiter", ",").build()).build())

t.select(t.a + lit(1), t.b, t.c).execute_insert("csv_sink").wait()

with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')