Example #1
    def test_session_window(self):
        t = self.t_env.from_elements([(1000, 1, "Hello")], ["a", "b", "c"])
        result = t.window(Session.with_gap(expr.lit(1).seconds).on("a").alias("w"))\
            .group_by(expr.col('w'), expr.col('c')).select(t.b.sum)

        # inspect the underlying Java QueryOperation to verify the generated plan
        query_operation = result._j_table.getQueryOperation().getChildren().get(0)
        self.assertEqual('[c]', query_operation.getGroupingExpressions().toString())
        self.assertEqual('SessionWindow(field: [a], gap: [1000])',
                         query_operation.getGroupWindow().asSummaryString())
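
These tests rely on a harness-provided self.t_env. As a minimal standalone sketch, assuming a streaming-mode TableEnvironment stands in for the harness setup (the actual test-base configuration is not shown in the source), the same windowed aggregation can be built like this:

from pyflink.table import EnvironmentSettings, TableEnvironment
from pyflink.table import expressions as expr
from pyflink.table.window import Session

# Assumption: a plain streaming-mode environment replaces the test
# harness's self.t_env.
t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
t = t_env.from_elements([(1000, 1, "Hello")], ["a", "b", "c"])
result = t.window(Session.with_gap(expr.lit(1).seconds).on(expr.col("a")).alias("w")) \
    .group_by(expr.col("w"), expr.col("c")) \
    .select(expr.col("b").sum)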
Example #2
    def test_session_group_window_over_time(self):
        # write the test data to a temporary CSV source file
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00',
            '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00',
            '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00',
            '2,2,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_session_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.register_function("my_count", CountAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd'],
            [
                DataTypes.TINYINT(),
                DataTypes.TIMESTAMP(3),
                DataTypes.TIMESTAMP(3),
                DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t.window(Session.with_gap("30.minutes").on("rowtime").alias("w")) \
            .group_by("a, b, w") \
            .select("a, w.start, w.end, my_count(c) as c") \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual,
                           ["+I[3, 2018-03-11 03:10:00.0, 2018-03-11 03:40:00.0, 1]",
                            "+I[2, 2018-03-11 03:10:00.0, 2018-03-11 04:00:00.0, 2]",
                            "+I[1, 2018-03-11 03:10:00.0, 2018-03-11 04:10:00.0, 2]",
                            "+I[1, 2018-03-11 04:20:00.0, 2018-03-11 04:50:00.0, 1]"])
Example #3
    def test_session_window(self):
        t = self.t_env.from_elements([(1000, 1, "Hello"), (2000, 2, "Hello"),
                                      (4000, 4, "Hello"), (5000, 8, "Hello")],
                                     ["a", "b", "c"])

        result = t.window(Session.with_gap("1.seconds").on("a").alias("w"))\
            .group_by("w, c").select("b.sum")
        actual = self.collect(result)

        expected = ['3', '12']
        self.assert_equals(actual, expected)
Example #4
    def test_session_group_window_over_time(self):
        # write the test data to a temporary CSV source file
        tmp_dir = self.tempdir
        data = [
            '1,1,2,2018-03-11 03:10:00', '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00', '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00', '2,2,3,2018-03-11 03:30:00'
        ]
        source_path = tmp_dir + '/test_session_group_window_over_time.csv'
        with open(source_path, 'w') as fd:
            for ele in data:
                fd.write(ele + '\n')

        self.t_env.register_function("my_count", CountAggregateFunction())

        source_table = """
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                rowtime TIMESTAMP(3),
                WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path
        self.t_env.execute_sql(source_table)
        t = self.t_env.from_path("source_table")

        from pyflink.testing import source_sink_utils
        sink_table_ddl = """
        CREATE TABLE Results(a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d BIGINT)
        WITH ('connector'='test-sink')
        """
        self.t_env.execute_sql(sink_table_ddl)
        t.window(Session.with_gap(lit(30).minutes).on(t.rowtime).alias("w")) \
            .group_by(t.a, t.b, col("w")) \
            .select(t.a, col("w").start, col("w").end, call("my_count", t.c).alias("c")) \
            .execute_insert("Results") \
            .wait()
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "+I[3, 2018-03-11T03:10, 2018-03-11T03:40, 1]",
            "+I[2, 2018-03-11T03:10, 2018-03-11T04:00, 2]",
            "+I[1, 2018-03-11T03:10, 2018-03-11T04:10, 2]",
            "+I[1, 2018-03-11T04:20, 2018-03-11T04:50, 1]"
        ])
Example #5
    def test_session_window(self):
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.LONG, DataTypes.INT, DataTypes.STRING]
        data = [(1000, 1, "Hello"), (2000, 2, "Hello"), (4000, 4, "Hello"), (5000, 8, "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
        t_env = self.t_env
        t_env.register_table_source("Source", csv_source)
        source = t_env.scan("Source")

        result = source.window(Session.with_gap("1.seconds").on("a").alias("w"))\
            .group_by("w, c").select("b.sum")
        actual = self.collect(result)

        expected = ['3', '12']
        self.assert_equals(actual, expected)
Example #6
    def test_session_window(self):
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.INT(), DataTypes.STRING()]
        data = [(1000, 1, "Hello"), (2000, 2, "Hello"), (4000, 4, "Hello"), (5000, 8, "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)
        t_env = self.t_env
        t_env.register_table_source("Source", csv_source)
        source = t_env.scan("Source")

        result = source.window(Session.with_gap("1.seconds").on("a").alias("w"))\
            .group_by("w, c").select("b.sum")
        actual = self.collect(result)

        expected = ['3', '12']
        self.assert_equals(actual, expected)
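
Examples #3, #5, and #6 use the string-based expression API ("1.seconds", "w, c", "b.sum"), which later PyFlink releases deprecated in favor of the expression DSL. A sketch of the same window in DSL form, assuming the same source table:

from pyflink.table.expressions import col, lit
from pyflink.table.window import Session

# Expression-DSL equivalent of Session.with_gap("1.seconds").on("a")...
result = source.window(Session.with_gap(lit(1).seconds).on(col("a")).alias("w")) \
    .group_by(col("w"), col("c")) \
    .select(col("b").sum)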
Example #7
def session_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(8000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    # define the session window operation
    table = table.window(Session.with_gap(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, table.price.sum, col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()
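
The same session window can also be expressed in SQL with the SESSION group window functions. A sketch, assuming the table as read from the data stream (columns ts, name, price, i.e. before the windowed aggregation above) is registered under the hypothetical view name "orders":

# Hypothetical view name, introduced only for this sketch.
t_env.create_temporary_view("orders", table)
result = t_env.sql_query("""
    SELECT
        name,
        SUM(price) AS total_price,
        SESSION_START(ts, INTERVAL '5' SECOND) AS w_start,
        SESSION_END(ts, INTERVAL '5' SECOND) AS w_end
    FROM orders
    GROUP BY SESSION(ts, INTERVAL '5' SECOND), name
""")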
Example #8
def session_time_window_batch():
    b_env = ExecutionEnvironment.get_execution_environment()
    b_env.set_parallelism(1)
    bt_env = BatchTableEnvironment.create(b_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_session_time_window_batch.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    bt_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    bt_env.register_table_sink(
        "result", CsvTableSink(["a"], [DataTypes.INT()], result_file))
    orders = bt_env.scan("Orders")
    result = orders.window(Session.with_gap("10.minutes").on("rowtime").alias("w")) \
        .group_by("w").select("b.sum")
    result.insert_into("result")
    bt_env.execute("session time window batch")
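
ExecutionEnvironment and BatchTableEnvironment belong to the legacy DataSet-based batch API, which was deprecated and eventually removed. On recent PyFlink releases the equivalent setup is a unified TableEnvironment in batch mode; a minimal sketch:

from pyflink.table import EnvironmentSettings, TableEnvironment

# Unified batch-mode environment replacing the deprecated
# ExecutionEnvironment + BatchTableEnvironment pair.
t_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())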
Example #9
def session_time_window_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/session_time_window_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env \
        .connect(  # declare the external system to connect to
            Kafka()
            .version("0.11")
            .topic("user")
            .start_from_earliest()
            .property("zookeeper.connect", "localhost:2181")
            .property("bootstrap.servers", "localhost:9092")
        ) \
        .with_format(  # declare a format for this system
            Json()
            .fail_on_missing_field(True)
            .json_schema(
                "{"
                "  type: 'object',"
                "  properties: {"
                "    a: {"
                "      type: 'string'"
                "    },"
                "    b: {"
                "      type: 'string'"
                "    },"
                "    c: {"
                "      type: 'string'"
                "    },"
                "    time: {"
                "      type: 'string',"
                "      format: 'date-time'"
                "    }"
                "  }"
                "}"
             )
         ) \
        .with_schema(  # declare the schema of the table
             Schema()
             .field("rowtime", DataTypes.TIMESTAMP())
             .rowtime(
                Rowtime()
                .timestamps_from_field("time")
                .watermarks_periodic_bounded(60000))
             .field("a", DataTypes.STRING())
             .field("b", DataTypes.STRING())
             .field("c", DataTypes.STRING())
         ) \
        .in_append_mode() \
        .register_table_source("source")

    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b"],
            [DataTypes.STRING(), DataTypes.STRING()], result_file))

    st_env.scan("source").window(Session.with_gap("10.minutes").on("rowtime").alias("w")) \
        .group_by("w, a") \
        .select("a, max(b)").insert_into("result")

    st_env.execute("session time window streaming")
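
The connect()/descriptor API used above was deprecated in favor of SQL DDL. A rough DDL equivalent of the Kafka source, under the assumption that the newer universal Kafka SQL connector is available (option names follow that connector, not the legacy descriptor properties):

st_env.execute_sql("""
    CREATE TABLE source (
        a STRING,
        b STRING,
        c STRING,
        `time` TIMESTAMP(3),
        rowtime AS `time`,
        -- 60000 ms bounded watermark from the descriptor version
        WATERMARK FOR rowtime AS rowtime - INTERVAL '60' SECOND
    ) WITH (
        'connector' = 'kafka',
        'topic' = 'user',
        'properties.bootstrap.servers' = 'localhost:9092',
        'scan.startup.mode' = 'earliest-offset',
        'format' = 'json'
    )
""")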