Code example #1
def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # (for a streaming job, StreamExecutionEnvironment and StreamTableEnvironment would be used instead)

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    # Writing fails if the output file already exists
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
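Note: demo01 above and demo02 in the next example omit their import block. Judging from code example #22 (which shows most of the same imports), they rely on roughly the following; in PyFlink 1.x the batch ExecutionEnvironment comes from pyflink.dataset (a sketch, not part of the original excerpts):

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.expressions import lit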
Code example #2
def demo02():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    # (for a streaming job, StreamExecutionEnvironment and StreamTableEnvironment would be used instead)

    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/input'
        )
    """

    my_sink_ddl = """
        create table mySink (
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output'
        )
    """

    t_env.execute_sql(my_source_ddl)
    t_env.execute_sql(my_sink_ddl)

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
Code example #3
File: test_pandas_udaf.py Project: xinsmile/flink
    def test_tumble_group_window_aggregate_function(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))
        sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b TIMESTAMP(3), c FLOAT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        tumble_window = Tumble.over(lit(1).hours) \
            .on(col("rowtime")) \
            .alias("w")
        t.window(tumble_window) \
            .group_by(col("w")) \
            .select(col("w").start, col("w").end, mean_udaf(t.b)) \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[2018-03-11T03:00, 2018-03-11T04:00, 2.2]",
                            "+I[2018-03-11T04:00, 2018-03-11T05:00, 8.0]"])
Code example #4
    def test_tumbling_group_window_over_time(self):
        # create source file path
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:30:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
        ]
        source_path = tmp_dir + '/test_tumbling_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.create_temporary_system_function(
            "my_count", CountDistinctAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c INT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d BIGINT, e BIGINT)
        WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        t.window(Tumble.over(lit(1).hours).on(t.rowtime).alias("w")) \
            .group_by(t.a, col("w")) \
            .select(t.a,
                    col("w").start,
                    col("w").end,
                    t.c.count.alias("c"),
                    call("my_count", t.c).alias("d")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "+I[2, 2018-03-11T03:00, 2018-03-11T04:00, 2, 1]",
            "+I[3, 2018-03-11T03:00, 2018-03-11T04:00, 1, 1]",
            "+I[1, 2018-03-11T03:00, 2018-03-11T04:00, 2, 2]",
            "+I[1, 2018-03-11T04:00, 2018-03-11T05:00, 1, 1]"
        ])
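Note: the unit-test snippets in this collection come from Flink's flink-python test suite and run inside a test base class that provides self.t_env. Helpers such as mean_udaf (used in #3, #20, #21, #25), my_count (#4), and CountAggregateFunction (#6) are defined elsewhere in those test modules. A minimal sketch of plausible definitions, mirroring the pandas UDAF defined inline in code example #10 and PyFlink's general AggregateFunction API (the exact upstream code may differ):

from pyflink.table import AggregateFunction, DataTypes
from pyflink.table.expressions import col, lit, call
from pyflink.table.udf import udaf
from pyflink.testing import source_sink_utils


@udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
def mean_udaf(v):
    return v.mean()


class CountAggregateFunction(AggregateFunction):
    # Plain count; the CountDistinctAggregateFunction in code example #4
    # would additionally track distinct values.

    def create_accumulator(self):
        # single-element list so the count can be updated in place
        return [0]

    def accumulate(self, accumulator, *args):
        accumulator[0] += 1

    def retract(self, accumulator, *args):
        accumulator[0] -= 1

    def merge(self, accumulator, accumulators):
        for other in accumulators:
            accumulator[0] += other[0]

    def get_value(self, accumulator):
        return accumulator[0]

    def get_accumulator_type(self):
        return DataTypes.ARRAY(DataTypes.BIGINT())

    def get_result_type(self):
        return DataTypes.BIGINT()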
Code example #5
File: test_window.py Project: Eureka1996/flink
    def test_session_window(self):
        t = self.t_env.from_elements([(1000, 1, "Hello")], ["a", "b", "c"])
        result = t.window(Session.with_gap(expr.lit(1).seconds).on("a").alias("w"))\
            .group_by(expr.col('w'), expr.col('c')).select(t.b.sum)

        query_operation = result._j_table.getQueryOperation().getChildren().get(0)
        self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
        self.assertEqual('SessionWindow(field: [a], gap: [1000])',
                         query_operation.getGroupWindow().asSummaryString())
Code example #6
    def test_session_group_window_over_time(self):
        # create source file path
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_session_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.register_function("my_count", CountAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd'],
            [
                DataTypes.TINYINT(),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t.window(Session.with_gap(lit(30).minutes).on(t.rowtime).alias("w")) \
            .group_by(t.a, t.b, col("w")) \
            .select(t.a, col("w").start, col("w").end, call("my_count", t.c).alias("c")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[3, 2018-03-11 03:10:00.0, 2018-03-11 03:40:00.0, 1]",
                            "+I[2, 2018-03-11 03:10:00.0, 2018-03-11 04:00:00.0, 2]",
                            "+I[1, 2018-03-11 03:10:00.0, 2018-03-11 04:10:00.0, 2]",
                            "+I[1, 2018-03-11 04:20:00.0, 2018-03-11 04:50:00.0, 1]"])
Code example #7
File: test_udf.py Project: shawkins/flink
    def test_chaining_scalar_function(self):
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)],
                                     ['a', 'b', 'c'])
        t = t.select(add(add_one(t.a), subtract_one(t.b)), t.c, expr.lit(1))

        result = self.collect(t)
        self.assertEqual(result, ["3,1,1", "7,2,1", "4,3,1"])
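Note: add and SubtractOne are module-level test helpers that this excerpt does not show (code examples #12 and #15 reuse them). Plausible definitions, assuming the usual scalar-function shapes:

from pyflink.table import DataTypes
from pyflink.table import expressions as expr
from pyflink.table.udf import udf, ScalarFunction


class SubtractOne(ScalarFunction):

    def eval(self, i):
        return i - 1


add = udf(lambda i, j: i + j, result_type=DataTypes.BIGINT())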
Code example #8
File: table_operator.py Project: wupengbo125/penter
def demo01():
    # environment configuration
    t_env = BatchTableEnvironment.create(environment_settings=EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build())

    # register Orders table and Result table sink in table environment
    source_data_path = "/path/to/source/directory/"
    result_data_path = "/path/to/result/directory/"
    source_ddl = f"""
            create table Orders(
                a VARCHAR,
                b BIGINT,
                c BIGINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '1' SECOND
            ) with (
                'connector' = 'filesystem',
                'format' = 'csv',
                'path' = '{source_data_path}'
            )
            """
    t_env.execute_sql(source_ddl)

    sink_ddl = f"""
        create table `Result`(
            a VARCHAR,
            cnt BIGINT
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '{result_data_path}'
        )
        """
    t_env.execute_sql(sink_ddl)

    # specify table program
    orders = t_env.from_path("Orders")  # schema (a, b, c, rowtime)

    orders.group_by("a").select(orders.a, orders.b.count.alias('cnt')).execute_insert("Result").wait()

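    # The calls below each return a new Table that is not used further; they only
    # illustrate individual Table API operations (filtering, column changes, grouping).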
    orders.where(orders.a == 'red')
    orders.filter(orders.b % 2 == 0)
    orders.add_columns(concat(orders.c, 'sunny'))
    orders.add_or_replace_columns(concat(orders.c, 'sunny').alias('desc'))
    orders.drop_columns(orders.b, orders.c)
    orders.rename_columns(orders.b.alias('b2'), orders.c.alias('c2'))
    orders.group_by(orders.a).select(orders.a, orders.b.sum.alias('d'))

    # tab.group_by(tab.key).select(tab.key, tab.value.avg.alias('average'))
    # tab.group_by("key").select("key, value.avg as average")
    result = orders.filter(orders.a.is_not_null & orders.b.is_not_null & orders.c.is_not_null) \
        .select(orders.a.lower_case.alias('a'), orders.b, orders.rowtime) \
        .window(Tumble.over(lit(1).hour).on(orders.rowtime).alias("hourly_window")) \
        .group_by(col('hourly_window'), col('a')) \
        .select(col('a'), col('hourly_window').end.alias('hour'), col('b').avg.alias('avg_billing_amount'))
Code example #9
    def test_expressions(self):
        expr1 = col('a')
        expr2 = col('b')
        expr3 = col('c')

        self.assertEqual('10', str(lit(10, DataTypes.INT(False))))
        self.assertEqual('rangeTo(1, 2)', str(range_(1, 2)))
        self.assertEqual('and(a, b, c)', str(and_(expr1, expr2, expr3)))
        self.assertEqual('or(a, b, c)', str(or_(expr1, expr2, expr3)))

        from pyflink.table.expressions import UNBOUNDED_ROW, UNBOUNDED_RANGE, CURRENT_ROW, \
            CURRENT_RANGE
        self.assertEqual('unboundedRow()', str(UNBOUNDED_ROW))
        self.assertEqual('unboundedRange()', str(UNBOUNDED_RANGE))
        self.assertEqual('currentRow()', str(CURRENT_ROW))
        self.assertEqual('currentRange()', str(CURRENT_RANGE))

        self.assertEqual('currentDate()', str(current_date()))
        self.assertEqual('currentTime()', str(current_time()))
        self.assertEqual('currentTimestamp()', str(current_timestamp()))
        self.assertEqual('localTime()', str(local_time()))
        self.assertEqual('localTimestamp()', str(local_timestamp()))
        self.assertEqual('toTimestampLtz(123, 0)', str(to_timestamp_ltz(123, 0)))
        self.assertEqual("temporalOverlaps(cast('2:55:00', TIME(0)), 3600000, "
                         "cast('3:30:00', TIME(0)), 7200000)",
                         str(temporal_overlaps(
                             lit("2:55:00").to_time,
                             lit(1).hours,
                             lit("3:30:00").to_time,
                             lit(2).hours)))
        self.assertEqual("dateFormat(time, '%Y, %d %M')",
                         str(date_format(col("time"), "%Y, %d %M")))
        self.assertEqual("timestampDiff(DAY, cast('2016-06-15', DATE), cast('2016-06-18', DATE))",
                         str(timestamp_diff(
                             TimePointUnit.DAY,
                             lit("2016-06-15").to_date,
                             lit("2016-06-18").to_date)))
        self.assertEqual('array(1, 2, 3)', str(array(1, 2, 3)))
        self.assertEqual("row('key1', 1)", str(row("key1", 1)))
        self.assertEqual("map('key1', 1, 'key2', 2, 'key3', 3)",
                         str(map_("key1", 1, "key2", 2, "key3", 3)))
        self.assertEqual('4', str(row_interval(4)))
        self.assertEqual('pi()', str(pi()))
        self.assertEqual('e()', str(e()))
        self.assertEqual('rand(4)', str(rand(4)))
        self.assertEqual('randInteger(4)', str(rand_integer(4)))
        self.assertEqual('atan2(1, 2)', str(atan2(1, 2)))
        self.assertEqual('minusPrefix(a)', str(negative(expr1)))
        self.assertEqual('concat(a, b, c)', str(concat(expr1, expr2, expr3)))
        self.assertEqual("concat_ws(', ', b, c)", str(concat_ws(', ', expr2, expr3)))
        self.assertEqual('uuid()', str(uuid()))
        self.assertEqual('null', str(null_of(DataTypes.BIGINT())))
        self.assertEqual('log(a)', str(log(expr1)))
        self.assertEqual('ifThenElse(a, b, c)', str(if_then_else(expr1, expr2, expr3)))
        self.assertEqual('withColumns(a, b, c)', str(with_columns(expr1, expr2, expr3)))
        self.assertEqual('a.b.c(a)', str(call('a.b.c', expr1)))
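Note: test_expressions exercises module-level helpers from pyflink.table.expressions; the excerpt omits the imports. A sketch of the names it uses (TimePointUnit's import location is assumed and may vary by version):

from pyflink.table import DataTypes
from pyflink.table.expression import TimePointUnit  # assumed location
from pyflink.table.expressions import (
    col, lit, range_, and_, or_, current_date, current_time, current_timestamp,
    local_time, local_timestamp, to_timestamp_ltz, temporal_overlaps, date_format,
    timestamp_diff, array, row, map_, row_interval, pi, e, rand, rand_integer,
    atan2, negative, concat, concat_ws, uuid, null_of, log, if_then_else,
    with_columns, call)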
Code example #10
File: pandas_udaf.py Project: Eureka1996/flink
def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
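Note: pandas_udaf() above needs imports along these lines, and the upstream example is normally run as a script; a sketch (assuming Instant is importable from pyflink.common.time):

from pyflink.common import Types
from pyflink.common.time import Instant
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import DataTypes, Schema, StreamTableEnvironment, TableDescriptor
from pyflink.table.expressions import col, lit
from pyflink.table.udf import udaf
from pyflink.table.window import Tumble

if __name__ == '__main__':
    pandas_udaf()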
Code example #11
def word_count(input_path, output_path):
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    # write all the data to one file
    t_env.get_config().get_configuration().set_string("parallelism.default",
                                                      "1")

    # define the source
    if input_path is not None:
        t_env.create_temporary_table(
            'source',
            TableDescriptor.for_connector('filesystem').schema(
                Schema.new_builder().column(
                    'word', DataTypes.STRING()).build()).option(
                        'path', input_path).format('csv').build())
        tab = t_env.from_path('source')
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        tab = t_env.from_elements(
            map(lambda i: (i, ), word_count_data),
            DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())]))

    # define the sink
    if output_path is not None:
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('filesystem').schema(
                Schema.new_builder().column('word', DataTypes.STRING()).column(
                    'count',
                    DataTypes.BIGINT()).build()).option('path', output_path).
            format(FormatDescriptor.for_format('canal-json').build()).build())
    else:
        print(
            "Printing result to stdout. Use --output to specify output path.")
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('print').schema(
                Schema.new_builder().column('word', DataTypes.STRING()).column(
                    'count', DataTypes.BIGINT()).build()).build())

    @udtf(result_types=[DataTypes.STRING()])
    def split(line: Row):
        for s in line[0].split():
            yield Row(s)

    # compute word count
    tab.flat_map(split).alias('word') \
       .group_by(col('word')) \
       .select(col('word'), lit(1).count) \
       .execute_insert('sink') \
       .wait()
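Note: word_count() references word_count_data, a module-level list of sample lines not included in this excerpt; the imports below cover the names used above (the sample data is a stand-in):

from pyflink.common import Row
from pyflink.table import (DataTypes, EnvironmentSettings, FormatDescriptor,
                           Schema, TableDescriptor, TableEnvironment)
from pyflink.table.expressions import col, lit
from pyflink.table.udf import udtf

# Stand-in for the sample sentences shipped with the upstream example.
word_count_data = ["To be or not to be that is the question"]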
Code example #12
    def test_chaining_scalar_function(self):
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
        t.select(add(add_one(t.a), subtract_one(t.b)), t.c, expr.lit(1)) \
            .execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["+I[3, 1, 1]", "+I[7, 2, 1]", "+I[4, 3, 1]"])
Code example #13
    def test_left_outer_join_lateral_with_join_predicate(self):
        t_env = self.t_env
        t_env.create_java_temporary_system_function("split",
                                                    "org.apache.flink.table.utils.TableFunc1")
        source = t_env.from_elements([("1", "1#3#5#7"), ("2", "2#4#6#8")], ["id", "words"])

        # only support "true" as the join predicate currently
        result = source.left_outer_join_lateral(expr.call('split', source.words).alias('word'),
                                                expr.lit(True))

        query_operation = result._j_table.getQueryOperation()
        self.assertEqual('LEFT_OUTER', query_operation.getJoinType().toString())
        self.assertTrue(query_operation.isCorrelated())
        self.assertEqual('true', query_operation.getCondition().toString())
Code example #14
File: word_count.py Project: wupengbo125/penter
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    # t_config.set_python_executable("/opt/python38/bin/python3")
    # Alternatively, add python.client.executable: /usr/bin/python3 to conf/flink-conf.yaml
    t_env = BatchTableEnvironment.create(env, t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename,
                          e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])


    table.group_by(table.word) \
        .select(table.word, expr.lit(1).count.alias('count')) \
        .execute_insert("Results").wait()
Code example #15
    def test_chaining_scalar_function(self):
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        sink_table_ddl = """
                CREATE TABLE Results(a BIGINT, b BIGINT, c INT) WITH ('connector'='test-sink')
                """
        self.t_env.execute_sql(sink_table_ddl)

        t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)],
                                     ['a', 'b', 'c'])
        t.select(add(add_one(t.a), subtract_one(t.b)), t.c, expr.lit(1)) \
            .execute_insert("Results").wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[3, 1, 1]", "+I[7, 2, 1]", "+I[4, 3, 1]"])
Code example #16
    def test_window_aggregate_with_pandas_udaf(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT(),
                DataTypes.INT()
            ])
        self.t_env.register_table_sink("Results", table_sink)
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        tumble_window = Tumble.over(expr.lit(1).hours) \
            .on(expr.col("rowtime")) \
            .alias("w")
        t.select(t.b, t.rowtime) \
            .window(tumble_window) \
            .group_by("w") \
            .aggregate(pandas_udaf.alias("d", "e")) \
            .select("w.rowtime, d, e") \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["2018-03-11 03:59:59.999,2.2,3",
                            "2018-03-11 04:59:59.999,8.0,8"])
Code example #17
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    t_env = BatchTableEnvironment.create(environment_settings=env_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
         .select(table.word, expr.lit(1).count.alias('count')) \
         .insert_into("Results")

    t_env.execute("word_count")
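Note: code examples #14 and #17 are variants of the same batch word count; both assume roughly these imports (a sketch):

import logging
import os
import shutil
import tempfile

from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, EnvironmentSettings, TableConfig
from pyflink.table import expressions as expr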
Code example #18
def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///app/src/kafka-clients-2.8.0.jar")
    env.add_jars("file:///app/src/flink-connector-kafka_2.12-1.12.3.jar")
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)

    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    config = env.get_checkpoint_config()
    config.enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION)

    st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())

    print("register kafka source")
    register_kafka_source(st_env)
    print("register transaction sinks")
    register_transactions_sink_into_csv(st_env)


    st_env.from_path("source_tbl") \
       .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
       .group_by(col("w")) \
       .select("""count(message) as total,
                   w.end as end_time
                  """) \
       .insert_into("total_sink")

    st_env.from_path("source_tbl") \
       .where("message = 'dolorem'") \
       .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
       .group_by(col("w")) \
       .select("""
                   count(message) as total,
                   w.end as end_time
                  """) \
       .insert_into("grep_sink")

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w"), col("message")) \
        .select("""
                    count(message) as total,
                    message,
                    w.end as end_time
                   """) \
        .insert_into("topk_sink")

    st_env.execute("app")
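Note: register_kafka_source and register_transactions_sink_into_csv are project helpers that are not shown here. A hypothetical sketch of the source half, reusing the Kafka DDL options that appear in code example #24 (the topic, servers, and columns are placeholders):

def register_kafka_source(st_env):
    # Placeholder DDL; the real helper in the project may differ.
    st_env.execute_sql("""
        CREATE TABLE source_tbl (
            message STRING,
            ts TIMESTAMP(3),
            WATERMARK FOR ts AS ts - INTERVAL '5' SECOND
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'example-topic',
            'properties.bootstrap.servers' = 'localhost:9092',
            'properties.group.id' = 'example-group',
            'scan.startup.mode' = 'earliest-offset',
            'format' = 'json'
        )
    """)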
Code example #19
    def test_window_aggregate_with_pandas_udaf(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        sink_table_ddl = """
        CREATE TABLE Results(a TIMESTAMP(3), b FLOAT, c INT) WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        print(t.get_schema())
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        tumble_window = Tumble.over(expr.lit(1).hours) \
            .on(expr.col("rowtime")) \
            .alias("w")
        t.select(t.b, t.rowtime) \
            .window(tumble_window) \
            .group_by(expr.col("w")) \
            .aggregate(pandas_udaf.alias("d", "e")) \
            .select(expr.col("w").rowtime, expr.col("d"), expr.col("e")) \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[2018-03-11 03:59:59.999, 2.2, 3]",
                            "+I[2018-03-11 04:59:59.999, 8.0, 8]"])
Code example #20
File: test_pandas_udaf.py Project: zoltar9264/flink
    def test_tumble_group_window_aggregate_function(self):
        import datetime
        from pyflink.table.window import Tumble
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT()
            ])
        self.t_env.register_table_sink("Results", table_sink)
        self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)
        tumble_window = Tumble.over(lit(1).hours) \
            .on(col("rowtime")) \
            .alias("w")
        t.window(tumble_window) \
            .group_by(col("w")) \
            .select(col("w").start, col("w").end, mean_udaf(t.b)) \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.2]",
                            "+I[2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0]"])
Code example #21
File: test_pandas_udaf.py Project: zoltar9264/flink
    def test_tumbling_group_window_over_time(self):
        # create source file path
        import tempfile
        import os
        tmp_dir = tempfile.gettempdir()
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_tumbling_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        from pyflink.table.window import Tumble
        self.t_env.get_config().set(
            "pipeline.time-characteristic", "EventTime")
        self.t_env.register_function("mean_udaf", mean_udaf)

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd', 'e'],
            [
                DataTypes.TINYINT(),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT()])
        self.t_env.register_table_sink("Results", table_sink)
        t.window(Tumble.over(lit(1).hours).on(t.rowtime).alias("w")) \
            .group_by(t.a, t.b, col("w")) \
            .select(t.a,
                    col("w").start,
                    col("w").end,
                    col("w").rowtime,
                    mean_udaf(t.c).alias("b")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2018-03-11 03:59:59.999, 2.5]",
            "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 2018-03-11 04:59:59.999, 8.0]",
            "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2018-03-11 03:59:59.999, 2.0]",
            "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2018-03-11 03:59:59.999, 2.0]",
        ])
        os.remove(source_path)
Code example #22
File: example01.py Project: summer-vacation/AlgoExec
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.expressions import lit

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('/tmp/input')) \
    .with_format(OldCsv()
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .create_temporary_table('mySource')

t_env.connect(FileSystem().path('/tmp/output')) \
    .with_format(OldCsv()
                 .field_delimiter('\t')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .create_temporary_table('mySink')

tab = t_env.from_path('mySource')
tab.group_by(tab.word) \
   .select(tab.word, lit(1).count) \
   .execute_insert('mySink').wait()
Code example #23
File: test_expression.py Project: zhining-lu/flink
    def test_expression(self):
        expr1 = col('a')
        expr2 = col('b')
        expr3 = col('c')
        expr4 = col('d')
        expr5 = lit(10)

        # comparison functions
        self.assertEqual('equals(a, b)', str(expr1 == expr2))
        self.assertEqual('mod(2, b)', str(2 % expr2))
        self.assertEqual('notEquals(a, b)', str(expr1 != expr2))
        self.assertEqual('lessThan(a, b)', str(expr1 < expr2))
        self.assertEqual('lessThanOrEqual(a, b)', str(expr1 <= expr2))
        self.assertEqual('greaterThan(a, b)', str(expr1 > expr2))
        self.assertEqual('greaterThanOrEqual(a, b)', str(expr1 >= expr2))

        # logic functions
        self.assertEqual('and(a, b)', str(expr1 & expr2))
        self.assertEqual('or(a, b)', str(expr1 | expr2))
        self.assertEqual('isNotTrue(a)', str(expr1.is_not_true))
        self.assertEqual('isNotTrue(a)', str(~expr1))

        # arithmetic functions
        self.assertEqual('plus(a, b)', str(expr1 + expr2))
        self.assertEqual('plus(2, b)', str(2 + expr2))
        self.assertEqual('plus(cast(b, DATE), 2)', str(expr2.to_date + 2))
        self.assertEqual('minus(a, b)', str(expr1 - expr2))
        self.assertEqual('minus(cast(b, DATE), 2)', str(expr2.to_date - 2))
        self.assertEqual('times(a, b)', str(expr1 * expr2))
        self.assertEqual('divide(a, b)', str(expr1 / expr2))
        self.assertEqual('mod(a, b)', str(expr1 % expr2))
        self.assertEqual('power(a, b)', str(expr1**expr2))
        self.assertEqual('minusPrefix(a)', str(-expr1))

        self.assertEqual('exp(a)', str(expr1.exp))
        self.assertEqual('log10(a)', str(expr1.log10))
        self.assertEqual('log2(a)', str(expr1.log2))
        self.assertEqual('ln(a)', str(expr1.ln))
        self.assertEqual('log(a)', str(expr1.log()))
        self.assertEqual('cosh(a)', str(expr1.cosh))
        self.assertEqual('sinh(a)', str(expr1.sinh))
        self.assertEqual('sin(a)', str(expr1.sin))
        self.assertEqual('cos(a)', str(expr1.cos))
        self.assertEqual('tan(a)', str(expr1.tan))
        self.assertEqual('cot(a)', str(expr1.cot))
        self.assertEqual('asin(a)', str(expr1.asin))
        self.assertEqual('acos(a)', str(expr1.acos))
        self.assertEqual('atan(a)', str(expr1.atan))
        self.assertEqual('tanh(a)', str(expr1.tanh))
        self.assertEqual('degrees(a)', str(expr1.degrees))
        self.assertEqual('radians(a)', str(expr1.radians))
        self.assertEqual('sqrt(a)', str(expr1.sqrt))
        self.assertEqual('abs(a)', str(expr1.abs))
        self.assertEqual('abs(a)', str(abs(expr1)))
        self.assertEqual('sign(a)', str(expr1.sign))
        self.assertEqual('round(a, b)', str(expr1.round(expr2)))
        self.assertEqual('between(a, b, c)', str(expr1.between(expr2, expr3)))
        self.assertEqual('notBetween(a, b, c)',
                         str(expr1.not_between(expr2, expr3)))
        self.assertEqual('ifThenElse(a, b, c)', str(expr1.then(expr2, expr3)))

        self.assertEqual('isNull(a)', str(expr1.is_null))
        self.assertEqual('isNotNull(a)', str(expr1.is_not_null))
        self.assertEqual('isTrue(a)', str(expr1.is_true))
        self.assertEqual('isFalse(a)', str(expr1.is_false))
        self.assertEqual('isNotTrue(a)', str(expr1.is_not_true))
        self.assertEqual('isNotFalse(a)', str(expr1.is_not_false))
        self.assertEqual('distinct(a)', str(expr1.distinct))
        self.assertEqual('sum(a)', str(expr1.sum))
        self.assertEqual('sum0(a)', str(expr1.sum0))
        self.assertEqual('min(a)', str(expr1.min))
        self.assertEqual('max(a)', str(expr1.max))
        self.assertEqual('count(a)', str(expr1.count))
        self.assertEqual('avg(a)', str(expr1.avg))
        self.assertEqual('first_value(a)', str(expr1.first_value))
        self.assertEqual('last_value(a)', str(expr1.last_value))
        self.assertEqual('stddevPop(a)', str(expr1.stddev_pop))
        self.assertEqual('stddevSamp(a)', str(expr1.stddev_samp))
        self.assertEqual('varPop(a)', str(expr1.var_pop))
        self.assertEqual('varSamp(a)', str(expr1.var_samp))
        self.assertEqual('collect(a)', str(expr1.collect))
        self.assertEqual("as(a, 'a', 'b', 'c')",
                         str(expr1.alias('a', 'b', 'c')))
        self.assertEqual('cast(a, INT)', str(expr1.cast(DataTypes.INT())))
        self.assertEqual('asc(a)', str(expr1.asc))
        self.assertEqual('desc(a)', str(expr1.desc))
        self.assertEqual('in(a, b, c, d)', str(expr1.in_(expr2, expr3, expr4)))
        self.assertEqual('start(a)', str(expr1.start))
        self.assertEqual('end(a)', str(expr1.end))
        self.assertEqual('bin(a)', str(expr1.bin))
        self.assertEqual('hex(a)', str(expr1.hex))
        self.assertEqual('truncate(a, 3)', str(expr1.truncate(3)))

        # string functions
        self.assertEqual('substring(a, b, 3)', str(expr1.substring(expr2, 3)))
        self.assertEqual("trim(true, false, ' ', a)",
                         str(expr1.trim_leading()))
        self.assertEqual("trim(false, true, ' ', a)",
                         str(expr1.trim_trailing()))
        self.assertEqual("trim(true, true, ' ', a)", str(expr1.trim()))
        self.assertEqual('replace(a, b, c)', str(expr1.replace(expr2, expr3)))
        self.assertEqual('charLength(a)', str(expr1.char_length))
        self.assertEqual('upper(a)', str(expr1.upper_case))
        self.assertEqual('lower(a)', str(expr1.lower_case))
        self.assertEqual('initCap(a)', str(expr1.init_cap))
        self.assertEqual("like(a, 'Jo_n%')", str(expr1.like('Jo_n%')))
        self.assertEqual("similar(a, 'A+')", str(expr1.similar('A+')))
        self.assertEqual('position(a, b)', str(expr1.position(expr2)))
        self.assertEqual('lpad(a, 4, b)', str(expr1.lpad(4, expr2)))
        self.assertEqual('rpad(a, 4, b)', str(expr1.rpad(4, expr2)))
        self.assertEqual('overlay(a, b, 6, 2)', str(expr1.overlay(expr2, 6,
                                                                  2)))
        self.assertEqual("regexpReplace(a, b, 'abc')",
                         str(expr1.regexp_replace(expr2, 'abc')))
        self.assertEqual('regexpExtract(a, b, 3)',
                         str(expr1.regexp_extract(expr2, 3)))
        self.assertEqual('fromBase64(a)', str(expr1.from_base64))
        self.assertEqual('toBase64(a)', str(expr1.to_base64))
        self.assertEqual('ltrim(a)', str(expr1.ltrim))
        self.assertEqual('rtrim(a)', str(expr1.rtrim))
        self.assertEqual('repeat(a, 3)', str(expr1.repeat(3)))
        self.assertEqual("over(a, 'w')", str(expr1.over('w')))

        # temporal functions
        self.assertEqual('cast(a, DATE)', str(expr1.to_date))
        self.assertEqual('cast(a, TIME(0))', str(expr1.to_time))
        self.assertEqual('cast(a, TIMESTAMP(3))', str(expr1.to_timestamp))
        self.assertEqual('extract(YEAR, a)',
                         str(expr1.extract(TimeIntervalUnit.YEAR)))
        self.assertEqual('floor(a, YEAR)',
                         str(expr1.floor(TimeIntervalUnit.YEAR)))
        self.assertEqual('ceil(a)', str(expr1.ceil()))

        # advanced type helper functions
        self.assertEqual("get(a, 'col')", str(expr1.get('col')))
        self.assertEqual('flatten(a)', str(expr1.flatten))
        self.assertEqual('at(a, 0)', str(expr1.at(0)))
        self.assertEqual('cardinality(a)', str(expr1.cardinality))
        self.assertEqual('element(a)', str(expr1.element))

        # time definition functions
        self.assertEqual('rowtime(a)', str(expr1.rowtime))
        self.assertEqual('proctime(a)', str(expr1.proctime))
        self.assertEqual('120', str(expr5.year))
        self.assertEqual('120', str(expr5.years))
        self.assertEqual('30', str(expr5.quarter))
        self.assertEqual('30', str(expr5.quarters))
        self.assertEqual('10', str(expr5.month))
        self.assertEqual('10', str(expr5.months))
        self.assertEqual('6048000000', str(expr5.week))
        self.assertEqual('6048000000', str(expr5.weeks))
        self.assertEqual('864000000', str(expr5.day))
        self.assertEqual('864000000', str(expr5.days))
        self.assertEqual('36000000', str(expr5.hour))
        self.assertEqual('36000000', str(expr5.hours))
        self.assertEqual('600000', str(expr5.minute))
        self.assertEqual('600000', str(expr5.minutes))
        self.assertEqual('10000', str(expr5.second))
        self.assertEqual('10000', str(expr5.seconds))
        self.assertEqual('10', str(expr5.milli))
        self.assertEqual('10', str(expr5.millis))

        # hash functions
        self.assertEqual('md5(a)', str(expr1.md5))
        self.assertEqual('sha1(a)', str(expr1.sha1))
        self.assertEqual('sha224(a)', str(expr1.sha224))
        self.assertEqual('sha256(a)', str(expr1.sha256))
        self.assertEqual('sha384(a)', str(expr1.sha384))
        self.assertEqual('sha512(a)', str(expr1.sha512))
        self.assertEqual('sha2(a, 224)', str(expr1.sha2(224)))

        # json functions
        self.assertEqual("IS_JSON('42')", str(lit('42').is_json()))
        self.assertEqual("IS_JSON('42', SCALAR)",
                         str(lit('42').is_json(JsonType.SCALAR)))

        self.assertEqual("JSON_EXISTS('{}', '$.x')",
                         str(lit('{}').json_exists('$.x')))
        self.assertEqual(
            "JSON_EXISTS('{}', '$.x', FALSE)",
            str(lit('{}').json_exists('$.x', JsonExistsOnError.FALSE)))

        self.assertEqual(
            "JSON_VALUE('{}', '$.x', STRING, NULL, null, NULL, null)",
            str(lit('{}').json_value('$.x')))
        self.assertEqual(
            "JSON_VALUE('{}', '$.x', INT, DEFAULT, 42, ERROR, null)",
            str(
                lit('{}').json_value('$.x', DataTypes.INT(),
                                     JsonValueOnEmptyOrError.DEFAULT, 42,
                                     JsonValueOnEmptyOrError.ERROR, None)))

        self.assertEqual(
            "JSON_QUERY('{}', '$.x', WITHOUT_ARRAY, NULL, EMPTY_ARRAY)",
            str(
                lit('{}').json_query('$.x', JsonQueryWrapper.WITHOUT_ARRAY,
                                     JsonQueryOnEmptyOrError.NULL,
                                     JsonQueryOnEmptyOrError.EMPTY_ARRAY)))
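Note: in addition to the expression helpers, test_expression relies on several enums; their import location is assumed here (recent PyFlink exposes them from pyflink.table.expression) and may differ by version:

from pyflink.table import DataTypes
from pyflink.table.expression import (TimeIntervalUnit, JsonType, JsonExistsOnError,
                                      JsonValueOnEmptyOrError, JsonQueryWrapper,
                                      JsonQueryOnEmptyOrError)
from pyflink.table.expressions import col, lit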
Code example #24
File: Example.py Project: sdivens/blog
def log_processing():
    env_settings = EnvironmentSettings.new_instance().in_streaming_mode(
    ).use_blink_planner().build()
    t_env = StreamTableEnvironment.create(environment_settings=env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///Users/liuhongwei/.m2/repository/org/apache/flink/flink-connector-kafka_2.11/1.12.0/flink-connector-kafka_2.11-1.12.0.jar;file:///Users/liuhongwei/.m2/repository/net/sf/json-lib/json-lib/2.3/json-lib-2.3-jdk15.jar;file:///Users/liuhongwei/.m2/repository/org/apache/kafka/kafka-clients/2.4.1/kafka-clients-2.4.1.jar"
    )

    source_ddl = """
            CREATE TABLE source_table(
                token VARCHAR,
                stime BIGINT,
                appKey VARCHAR,
                user_action_time AS PROCTIME()
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'markTopic',
              'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
              'properties.group.id' = 'test_3',
              'scan.startup.mode' = 'earliest-offset',
              'format' = 'json'
            )
            """

    sink_ddl = """
            CREATE TABLE sink_table(
                token VARCHAR,
                appKey VARCHAR,
                stime TIMESTAMP(3) NOT NULL,
                nums BIGINT NOT NULL
            ) WITH (
              'connector' = 'kafka',
              'topic' = 'markTopic1',
              'properties.bootstrap.servers' = 'slavenode164.data.test.ds:9092,slavenode165.data.test.ds:9092,slavenode166.data.test.ds:9092',
              'format' = 'json'
            )
            """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)
    query_sql = """
        SELECT 
          token,
          appKey,
          TUMBLE_START(user_action_time, INTERVAL '5' MINUTE) as stime, 
          COUNT(token) as nums 
        FROM source_table 
        WHERE appKey = 'YSHAppAndroidIOSH5'
        GROUP BY 
          token,
          appKey,
          TUMBLE(user_action_time, INTERVAL '5' MINUTE)
    """
    # t_env.sql_query(query_sql) \
    #     .execute_insert("sink_table").wait()
    source_t = t_env.from_path("source_table")
    result = source_t.filter(source_t.appKey == "YSHAppAndroidIOSH5") \
        .window(Slide.over(lit(1).days)
                .every(lit(1).minutes)
                .on(source_t.user_action_time)
                .alias("w")) \
        .group_by(source_t.token, source_t.appKey, col("w")) \
        .select(source_t.token,
                source_t.appKey,
                col("w").start.alias("stime"),
                source_t.token.count.alias("nums"))

    result.execute_insert("sink_table").wait()
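Note: log_processing() above assumes imports along these lines (a sketch):

from pyflink.table import EnvironmentSettings, StreamTableEnvironment
from pyflink.table.expressions import col, lit
from pyflink.table.window import Slide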
Code example #25
File: test_pandas_udaf.py Project: xinsmile/flink
    def test_sliding_group_window_over_time(self):
        # create source file path
        import tempfile
        import os
        tmp_dir = tempfile.gettempdir()
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_sliding_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        from pyflink.table.window import Slide
        self.t_env.get_config().set(
            "pipeline.time-characteristic", "EventTime")
        self.t_env.register_function("mean_udaf", mean_udaf)

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")
        sink_table_ddl = """
            CREATE TABLE Results(a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d FLOAT)
            WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        t.window(Slide.over(lit(1).hours)
                 .every(lit(30).minutes)
                 .on(col("rowtime"))
                 .alias("w")) \
            .group_by(t.a, t.b, col("w")) \
            .select(t.a, col("w").start, col("w").end, mean_udaf(t.c).alias("b")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[1, 2018-03-11T02:30, 2018-03-11T03:30, 2.0]",
                            "+I[1, 2018-03-11T03:00, 2018-03-11T04:00, 2.5]",
                            "+I[1, 2018-03-11T03:30, 2018-03-11T04:30, 5.5]",
                            "+I[1, 2018-03-11T04:00, 2018-03-11T05:00, 8.0]",
                            "+I[2, 2018-03-11T02:30, 2018-03-11T03:30, 1.0]",
                            "+I[2, 2018-03-11T03:00, 2018-03-11T04:00, 2.0]",
                            "+I[2, 2018-03-11T03:30, 2018-03-11T04:30, 3.0]",
                            "+I[3, 2018-03-11T03:00, 2018-03-11T04:00, 2.0]",
                            "+I[3, 2018-03-11T02:30, 2018-03-11T03:30, 2.0]"])
        os.remove(source_path)
Code example #26
import os
import shutil
import tempfile

sink_path = tempfile.gettempdir() + '/batch.csv'
if os.path.exists(sink_path):
    if os.path.isfile(sink_path):
        os.remove(sink_path)
    else:
        shutil.rmtree(sink_path)
s_env.set_parallelism(1)
t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')],
                         ['a', 'b', 'c'])

st_env.create_temporary_table(
    "csv_sink",
    TableDescriptor.for_connector("filesystem").schema(
        Schema.new_builder().column(
            "a", DataTypes.BIGINT()).column("b", DataTypes.STRING()).column(
                "c",
                DataTypes.STRING()).build()).option("path", sink_path).format(
                    FormatDescriptor.for_format("csv").option(
                        "field-delimiter", ",").build()).build())

t.select(t.a + lit(1), t.b, t.c).execute_insert("csv_sink").wait()

with open(os.path.join(sink_path, os.listdir(sink_path)[0]), 'r') as f:
    lines = f.read()
    assert lines == '2,hi,hello\n' + '3,hi,hello\n'

print('pip_test_code.py success!')
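Note: this last snippet comes from a pip-installation smoke test and assumes s_env and st_env were created earlier; a sketch of the assumed setup:

from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import (DataTypes, FormatDescriptor, Schema,
                           StreamTableEnvironment, TableDescriptor)
from pyflink.table.expressions import lit

s_env = StreamExecutionEnvironment.get_execution_environment()
st_env = StreamTableEnvironment.create(s_env)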