Beispiel #1
0
    def test_union_all(self):
        """Check that ``union_all`` keeps duplicate rows from two CSV sources.

        Two sources sharing the (a, b, c) schema are registered, unioned and
        written to an append sink; rows occurring in both inputs are expected
        to show up twice in the result.
        """
        # NOTE(review): the DataTypes members are used without call parentheses
        # here -- presumably this targets an early PyFlink API; confirm before
        # switching to ``DataTypes.INT()`` and friends.
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

        # Hand os.path.join the directory and the file name as separate
        # components; joining a single pre-concatenated string does nothing.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        data = [(1, "Hi", "Hello"), (2, "Hi", "Hello"), (3, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

        source_path2 = os.path.join(self.tempdir, 'streaming2.csv')
        data2 = [(2, "Hi", "Hello"), (3, "Hello", "Python"), (4, "Hi", "Flink")]
        csv_source2 = self.prepare_csv_source(source_path2, data2, field_types, field_names)

        t_env = self.t_env
        t_env.register_table_source("Source1", csv_source)
        t_env.register_table_source("Source2", csv_source2)
        source1 = t_env.scan("Source1")
        source2 = t_env.scan("Source2")

        # The sink uses the same schema as the sources, so the schema lists are
        # reused rather than rebuilt.
        t_env.register_table_sink(
            "Results",
            field_names, field_types, source_sink_utils.TestAppendSink())

        result = source1.union_all(source2)
        result.insert_into("Results")
        t_env.execute()

        actual = source_sink_utils.results()
        expected = ['1,Hi,Hello',
                    '2,Hi,Hello',
                    '2,Hi,Hello',
                    '3,Hello,Hello',
                    '3,Hello,Python',
                    '4,Hi,Flink']
        self.assert_equals(actual, expected)
Beispiel #2
0
    def test_select(self):
        """Project ``a + 1, b, c`` from a CSV-backed table and verify the sink rows."""
        # Pass directory and file name separately so os.path.join actually joins.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        # The ``with`` block closes the file on exit; no explicit close() needed.
        with open(source_path, 'w') as f:
            f.write('1,hi,hello\n' + '2,hi,hello\n')

        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
        t_env = self.t_env
        # register Orders table in table environment
        t_env.register_table_source(
            "Orders",
            CsvTableSource(source_path, field_names, field_types))
        t_env.register_table_sink(
            "Results",
            field_names, field_types, source_sink_utils.TestAppendSink())

        t_env.scan("Orders") \
             .select("a + 1, b, c") \
             .insert_into("Results")
        t_env.execute()
        actual = source_sink_utils.results()

        expected = ['2,hi,hello', '3,hi,hello']
        self.assert_equals(actual, expected)
Beispiel #3
0
    def test_left_outer_join_without_where(self):
        """Left-outer-join two CSV sources on ``a = d`` and check the joined rows.

        The left row without a match (a = 1) is expected to appear with a null
        second column, since ``b + e`` is computed from a missing right side.
        """
        # Pass directory and file name separately so os.path.join actually joins.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, "Hi", "Hello"), (2, "Hi", "Hello"), (3, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

        source_path2 = os.path.join(self.tempdir, 'streaming2.csv')
        field_names2 = ["d", "e"]
        field_types2 = [DataTypes.INT(), DataTypes.STRING()]
        data2 = [(2, "Flink"), (3, "Python"), (3, "Flink")]
        csv_source2 = self.prepare_csv_source(source_path2, data2, field_types2, field_names2)

        t_env = self.t_env
        t_env.register_table_source("Source1", csv_source)
        t_env.register_table_source("Source2", csv_source2)
        source1 = t_env.scan("Source1")
        source2 = t_env.scan("Source2")

        # The sink schema differs from the source schema, so it gets its own
        # names instead of rebinding the source schema variables.
        sink_field_names = ["a", "b"]
        sink_field_types = [DataTypes.INT(), DataTypes.STRING()]
        t_env.register_table_sink(
            "Results",
            sink_field_names, sink_field_types, source_sink_utils.TestRetractSink())

        result = source1.left_outer_join(source2, "a = d").select("a, b + e")
        result.insert_into("Results")
        t_env.execute()
        actual = source_sink_utils.results()

        expected = ['1,null', '2,HiFlink', '3,HelloPython', '3,HelloFlink']
        self.assert_equals(actual, expected)
Beispiel #4
0
    def test_open(self):
        """Run the ``Subtract`` UDF with ``python.metric.enabled`` set to true."""
        env = self.t_env
        env.get_config().get_configuration().set_string('python.metric.enabled', 'true')

        subtract_udf = udf(Subtract(), result_type=DataTypes.BIGINT())
        env.register_function("subtract", subtract_udf)

        sink_columns = ['a', 'b']
        sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
        env.register_table_sink(
            "Results", source_sink_utils.TestAppendSink(sink_columns, sink_types))

        rows = [(1, 2), (2, 5), (3, 4)]
        projected = env.from_elements(rows, ['a', 'b']).select("a, subtract(b)")
        projected.insert_into("Results")
        env.execute("test")

        self.assert_equals(source_sink_utils.results(), ["1,1", "2,4", "3,3"])
Beispiel #5
0
    def test_overwrite_builtin_function(self):
        """Register a Python UDF under the name ``plus`` and call it in a select.

        The registered lambda computes ``i + j - 1``; the expected rows follow
        that formula.
        """
        env = self.t_env
        custom_plus = udf(lambda i, j: i + j - 1, result_type=DataTypes.BIGINT())
        env.register_function("plus", custom_plus)

        sink = source_sink_utils.TestAppendSink(['a'], [DataTypes.BIGINT()])
        env.register_table_sink("Results", sink)

        rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
        table = env.from_elements(rows, ['a', 'b', 'c'])
        table.select("plus(a, b)").insert_into("Results")
        env.execute("test")

        self.assert_equals(source_sink_utils.results(), ["2", "6", "3"])
Beispiel #6
0
    def test_overwrite_builtin_function(self):
        """Register a temporary system function named ``plus`` and select ``a + b``.

        The registered lambda computes ``i + j - 1``; the expected rows equal
        ``a + b - 1`` for each input row.
        """
        env = self.t_env
        custom_plus = udf(lambda i, j: i + j - 1, result_type=DataTypes.BIGINT())
        env.create_temporary_system_function("plus", custom_plus)

        # Sink table backed by the 'test-sink' connector.
        env.execute_sql("""
                        CREATE TABLE Results(a BIGINT) WITH ('connector'='test-sink')
                        """)

        rows = [(1, 2, 3), (2, 5, 6), (3, 1, 9)]
        table = env.from_elements(rows, ['a', 'b', 'c'])
        summed = table.select(table.a + table.b)
        summed.execute_insert("Results").wait()

        self.assert_equals(source_sink_utils.results(), ["+I[2]", "+I[6]", "+I[3]"])
Beispiel #7
0
    def test_udf_in_join_condition_2(self):
        """Join two tables on a predicate applying a Python UDF to both sides."""
        env = self.t_env
        left = env.from_elements([(1, "Hi"), (2, "Hi")], ['a', 'b'])
        right = env.from_elements([(2, "Flink")], ['c', 'd'])

        identity = udf(lambda i: i, result_type=DataTypes.BIGINT())
        env.create_temporary_system_function("f", identity)

        sink_columns = ['a', 'b', 'c', 'd']
        sink_types = [
            DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.BIGINT(), DataTypes.STRING()]
        env.register_table_sink(
            "Results", source_sink_utils.TestAppendSink(sink_columns, sink_types))

        joined = left.join(right).where("f(a) = f(c)")
        exec_insert_table(joined, "Results")

        self.assert_equals(source_sink_utils.results(), ["2,Hi,2,Flink"])
Beispiel #8
0
    def test_sql_query(self):
        """Run a SQL projection over an in-memory table and verify the sink rows."""
        env = self.t_env
        rows = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        source = env.from_elements(rows, ["a", "b", "c"])

        sink = source_sink_utils.TestAppendSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()])
        env.register_table_sink("sinks", sink)

        # The table object is interpolated into the statement via its string form.
        query = "select a + 1, b, c from %s" % source
        env.sql_query(query).execute_insert("sinks").wait()

        self.assert_equals(
            source_sink_utils.results(),
            ['+I[2, Hi, Hello]', '+I[3, Hello, Hello]'])
Beispiel #9
0
    def test_sql_update(self):
        """Insert an in-memory table into a sink via ``sql_update`` and check the rows."""
        env = self.t_env
        rows = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        source = env.from_elements(rows, ["a", "b", "c"])

        sink = source_sink_utils.TestAppendSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()])
        env.register_table_sink("sinks", sink)

        # The table object is interpolated into the statement via its string form.
        statement = "insert into sinks select * from %s" % source
        env.sql_update(statement)
        self.t_env.execute("test_sql_job")

        self.assert_equals(
            source_sink_utils.results(),
            ['+I[1, Hi, Hello]', '+I[2, Hello, Hello]'])
Beispiel #10
0
    def test_udf_without_arguments(self):
        """Call zero-argument UDFs, one declared deterministic and one not."""
        env = self.t_env

        constant_one = udf(
            lambda: 1, input_types=[], result_type=DataTypes.BIGINT(), deterministic=True)
        env.register_function("one", constant_one)

        constant_two = udf(
            lambda: 2, input_types=[], result_type=DataTypes.BIGINT(), deterministic=False)
        env.register_function("two", constant_two)

        sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        env.register_table_sink("Results", sink)

        rows = [(1, 2), (2, 5), (3, 1)]
        env.from_elements(rows, ['a', 'b']).select("one(), two()").insert_into("Results")
        env.execute("test")

        # One output row per input row, each holding the two constants.
        self.assert_equals(source_sink_utils.results(), ["1,2", "1,2", "1,2"])
Beispiel #11
0
    def test_udf_in_join_condition(self):
        """Join two tables on a predicate applying a Python UDF to the left key."""
        env = self.t_env
        left = env.from_elements([(2, "Hi")], ['a', 'b'])
        right = env.from_elements([(2, "Flink")], ['c', 'd'])

        identity = udf(lambda i: i, DataTypes.BIGINT(), DataTypes.BIGINT())
        env.register_function("f", identity)

        sink_columns = ['a', 'b', 'c', 'd']
        sink_types = [
            DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.BIGINT(), DataTypes.STRING()]
        env.register_table_sink(
            "Results", source_sink_utils.TestAppendSink(sink_columns, sink_types))

        joined = left.join(right).where("f(a) = c")
        joined.insert_into("Results")
        env.execute("test")

        self.assert_equals(source_sink_utils.results(), ["2,Hi,2,Flink"])
    def test_map_with_pandas_udf(self):
        """Chain two pandas UDFs and one general UDF through ``Table.map``.

        The first UDF builds ``(a, c + d)``, the second doubles its input and the
        third asserts it receives a ``Row`` and passes it through unchanged.
        """
        def pair_of_bigints():
            # Fresh ROW<c BIGINT, d BIGINT> type for each UDF declaration.
            return DataTypes.ROW([DataTypes.FIELD("c", DataTypes.BIGINT()),
                                  DataTypes.FIELD("d", DataTypes.BIGINT())])

        def func(x):
            import pandas as pd
            return pd.concat([x.a, x.c + x.d], axis=1)

        def func2(x):
            return x * 2

        def func3(x):
            assert isinstance(x, Row)
            return x

        env = self.t_env
        nested = DataTypes.ROW([DataTypes.FIELD("c", DataTypes.INT()),
                                DataTypes.FIELD("d", DataTypes.INT())])
        schema = DataTypes.ROW([DataTypes.FIELD("a", DataTypes.TINYINT()),
                                DataTypes.FIELD("b", nested)])
        rows = [(1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)), (2, Row(3, 4))]
        table = env.from_elements(rows, schema)

        env.execute_sql("""
        CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink')
        """)

        pandas_udf = udf(func, result_type=pair_of_bigints(), func_type='pandas')
        pandas_udf_2 = udf(func2, result_type=pair_of_bigints(), func_type='pandas')
        general_udf = udf(func3, result_type=pair_of_bigints())

        mapped = table.map(pandas_udf).map(pandas_udf_2).map(general_udf)
        mapped.execute_insert("Results").wait()

        self.assert_equals(
            source_sink_utils.results(),
            ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
Beispiel #13
0
    def test_scalar_function(self):
        """Register scalar UDFs defined in several ways and use them in one query.

        Covers a lambda, a ``SubtractOne`` instance, the externally defined ``add``,
        a ``CallablePlus`` instance and a ``functools.partial``.
        """
        import functools

        def partial_func(col, param):
            return col + param

        env = self.t_env

        # (registered name, function) pairs, registered in this order.
        registrations = [
            # lambda function
            ("add_one", udf(lambda i: i + 1, DataTypes.BIGINT(), DataTypes.BIGINT())),
            # Python ScalarFunction
            ("subtract_one", udf(SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT())),
            # Python function
            ("add", add),
            # callable function
            ("add_one_callable", udf(CallablePlus(), DataTypes.BIGINT(), DataTypes.BIGINT())),
            # partial function
            ("add_one_partial", udf(functools.partial(partial_func, param=1),
                                    DataTypes.BIGINT(), DataTypes.BIGINT())),
        ]
        for function_name, function in registrations:
            env.register_function(function_name, function)

        sink_columns = ['a', 'b', 'c', 'd', 'e']
        sink_types = [DataTypes.BIGINT() for _ in sink_columns]
        env.register_table_sink(
            "Results", source_sink_utils.TestAppendSink(sink_columns, sink_types))

        table = env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
        filtered = table.where("add_one(b) <= 3")
        projected = filtered.select(
            "add_one(a), subtract_one(b), add(a, c), add_one_callable(a), "
            "add_one_partial(a)")
        projected.insert_into("Results")
        env.execute("test")

        self.assert_equals(source_sink_utils.results(), ["2,1,4,2,2", "4,0,12,4,4"])
Beispiel #14
0
    def test_scalar_function(self):
        """Apply scalar UDFs through the expression API with Python metrics disabled.

        Covers a lambda, a ``SubtractOne`` instance, the externally defined ``add``,
        a ``CallablePlus`` instance, a ``functools.partial`` and a decorated
        function that asserts on ``_PYTHON_WORKER_MEMORY_LIMIT``.
        """
        import functools

        def partial_func(col, param):
            return col + param

        # check memory limit is set
        @udf(result_type=DataTypes.BIGINT())
        def check_memory_limit():
            assert os.environ['_PYTHON_WORKER_MEMORY_LIMIT'] is not None
            return 1

        env = self.t_env
        # test metric disabled.
        env.get_config().get_configuration().set_string('python.metric.enabled', 'false')

        # lambda function
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        # Python ScalarFunction
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())
        # callable function
        add_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())
        # partial function
        add_one_partial = udf(functools.partial(partial_func, param=1),
                              result_type=DataTypes.BIGINT())

        sink_columns = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
        sink_types = [DataTypes.BIGINT() for _ in sink_columns]
        env.register_table_sink(
            "Results", source_sink_utils.TestAppendSink(sink_columns, sink_types))

        t = env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
        filtered = t.where(add_one(t.b) <= 3)
        projected = filtered.select(
            add_one(t.a), subtract_one(t.b), add(t.a, t.c), add_one_callable(t.a),
            add_one_partial(t.a), check_memory_limit(), t.a)
        projected.execute_insert("Results").wait()

        self.assert_equals(
            source_sink_utils.results(),
            ["+I[2, 1, 4, 2, 2, 1, 1]", "+I[4, 0, 12, 4, 4, 1, 3]"])
    def test_map_with_pandas_udf(self):
        """Chain two pandas UDFs and one general UDF through ``Table.map``.

        The first UDF builds ``(a, c + d)``, the second doubles its input and the
        third asserts it receives a ``Row`` and passes it through unchanged.
        """
        def bigint_pair_type():
            # Fresh ROW<c BIGINT, d BIGINT> type for each UDF declaration.
            return DataTypes.ROW([DataTypes.FIELD("c", DataTypes.BIGINT()),
                                  DataTypes.FIELD("d", DataTypes.BIGINT())])

        def func(x):
            import pandas as pd
            return pd.concat([x.a, x.c + x.d], axis=1)

        def func2(x):
            return x * 2

        def func3(x):
            assert isinstance(x, Row)
            return x

        env = self.t_env
        inner_row = DataTypes.ROW([DataTypes.FIELD("c", DataTypes.INT()),
                                   DataTypes.FIELD("d", DataTypes.INT())])
        input_type = DataTypes.ROW([DataTypes.FIELD("a", DataTypes.TINYINT()),
                                    DataTypes.FIELD("b", inner_row)])
        elements = [
            (1, Row(2, 3)), (2, Row(1, 3)), (1, Row(5, 4)), (1, Row(8, 6)), (2, Row(3, 4))]
        table = env.from_elements(elements, input_type)

        sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        env.register_table_sink("Results", sink)

        pandas_udf = udf(func, result_type=bigint_pair_type(), func_type='pandas')
        pandas_udf_2 = udf(func2, result_type=bigint_pair_type(), func_type='pandas')
        general_udf = udf(func3, result_type=bigint_pair_type())

        pipeline = table.map(pandas_udf).map(pandas_udf_2).map(general_udf)
        pipeline.execute_insert("Results").wait()

        self.assert_equals(
            source_sink_utils.results(),
            ["+I[4, 8]", "+I[2, 10]", "+I[2, 28]", "+I[2, 18]", "+I[4, 14]"])
Beispiel #16
0
    def test_sliding_group_window_over_count(self):
        """Aggregate with ``my_sum`` over a sliding row-count window (size 2, step 1).

        The input is read from a CSV file through a filesystem table that adds a
        processing-time column; the job runs with default parallelism 1.
        """
        from pyflink.testing import source_sink_utils

        env = self.t_env
        env.get_config().get_configuration().set_string("parallelism.default", "1")

        # Write the input rows to a CSV file in the test's temp directory.
        csv_rows = [
            '1,1,2,2018-03-11 03:10:00', '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00', '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00', '2,2,3,2018-03-11 03:30:00',
            '3,3,3,2018-03-11 03:30:00'
        ]
        source_path = self.tempdir + '/test_sliding_group_window_over_count.csv'
        with open(source_path, 'w') as csv_file:
            csv_file.writelines(row + '\n' for row in csv_rows)

        env.register_function("my_sum", SumAggregateFunction())

        env.execute_sql("""
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                protime as PROCTIME()
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path)
        table = env.from_path("source_table")

        sink = source_sink_utils.TestAppendSink(
            ['a', 'd'], [DataTypes.TINYINT(), DataTypes.BIGINT()])
        env.register_table_sink("Results", sink)

        window = Slide.over("2.rows").every("1.rows").on("protime").alias("w")
        aggregated = table.window(window).group_by("a, w").select("a, my_sum(c) as b")
        aggregated.execute_insert("Results").wait()

        self.assert_equals(
            source_sink_utils.results(),
            ["+I[1, 5]", "+I[1, 11]", "+I[2, 4]", "+I[3, 5]"])
Beispiel #17
0
    def test_chaining_scalar_function(self):
        """Nest UDF calls (``add(add_one(a), subtract_one(b))``) in one projection."""
        env = self.t_env
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        env.execute_sql("""
                CREATE TABLE Results(a BIGINT, b BIGINT, c INT) WITH ('connector'='test-sink')
                """)

        rows = [(1, 2, 1), (2, 5, 2), (3, 1, 3)]
        t = env.from_elements(rows, ['a', 'b', 'c'])
        nested_call = add(add_one(t.a), subtract_one(t.b))
        t.select(nested_call, t.c, expr.lit(1)).execute_insert("Results").wait()

        self.assert_equals(
            source_sink_utils.results(),
            ["+I[3, 1, 1]", "+I[7, 2, 1]", "+I[4, 3, 1]"])
Beispiel #18
0
    def test_filter(self):
        """Filter rows with ``a > 1 && b = 'Hello'`` and verify the single match."""
        env = self.t_env
        rows = [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hello')]
        table = env.from_elements(rows, ['a', 'b', 'c'])

        sink_columns = ["a", "b", "c"]
        sink_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
        env.register_table_sink(
            "Results", sink_columns, sink_types, source_sink_utils.TestAppendSink())

        table.filter("a > 1 && b = 'Hello'").insert_into("Results")
        env.execute()

        self.assert_equals(source_sink_utils.results(), ['2,Hello,Hello'])
    def test_drop_columns(self):
        """Drop columns ``a`` and ``c`` and check that only ``b`` reaches the sink."""
        env = self.t_env
        rows = [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hello')]
        table = env.from_elements(rows, ['a', 'b', 'c'])

        env.register_table_sink(
            "Results", ["b"], [DataTypes.STRING()], source_sink_utils.TestAppendSink())

        remaining = table.select("a, b, c").drop_columns("a, c").select("b")
        remaining.insert_into("Results")
        env.execute()

        self.assert_equals(source_sink_utils.results(), ['Hi', 'Hello'])
    def test_add_or_replace_columns(self):
        """Add column ``b`` and replace column ``a`` via ``add_or_replace_columns``."""
        env = self.t_env
        rows = [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hello')]
        table = env.from_elements(rows, ['a', 'b', 'c'])

        sink_columns = ["b", "a"]
        sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
        env.register_table_sink(
            "Results", sink_columns, sink_types, source_sink_utils.TestAppendSink())

        only_a = table.select("a")
        updated = only_a.add_or_replace_columns("a + 1 as b, a + 2 as a")
        updated.insert_into("Results")
        env.execute()

        self.assert_equals(source_sink_utils.results(), ['3,2', '4,3'])
Beispiel #21
0
    def test_java_transformer(self):
        """Run ``WrapperTransformer`` selecting columns a and b and check the output."""
        ml_env = MLEnvironmentFactory().get_default()
        env = ml_env.get_stream_table_environment()

        sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        env.register_table_sink("TransformerResults", sink)

        rows = [(1, 2, 3, 4), (4, 3, 2, 1)]
        source_table = env.from_elements(rows, ['a', 'b', 'c', 'd'])

        transformer = WrapperTransformer(selected_cols=["a", "b"])
        transformed = transformer.transform(env, source_table)
        exec_insert_table(transformed, "TransformerResults")

        self.assert_equals(source_sink_utils.results(), ["1,2", "4,3"])
Beispiel #22
0
    def test_distinct(self):
        """Deduplicate rows with ``distinct`` and project ``a, c as b`` to a retract sink."""
        env = self.t_env
        rows = [(1, "Hi", "Hello"), (2, "Hello", "Hello"), (2, "Hello", "Hello")]
        table = env.from_elements(rows, ['a', 'b', 'c'])

        sink_columns = ["a", "b"]
        sink_types = [DataTypes.BIGINT(), DataTypes.STRING()]
        env.register_table_sink(
            "Results", sink_columns, sink_types, source_sink_utils.TestRetractSink())

        table.distinct().select("a, c as b").insert_into("Results")
        env.execute()

        self.assert_equals(source_sink_utils.results(), ['1,Hello', '2,Hello'])
Beispiel #23
0
    def test_chaining_scalar_function(self):
        """Nest registered UDF calls inside one select string and check the rows."""
        env = self.t_env

        add_one_udf = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        env.create_temporary_system_function("add_one", add_one_udf)
        subtract_one_udf = udf(SubtractOne(), result_type=DataTypes.BIGINT())
        env.create_temporary_system_function("subtract_one", subtract_one_udf)
        env.create_temporary_system_function("add", add)

        sink_columns = ['a', 'b', 'c']
        sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()]
        env.register_table_sink(
            "Results", source_sink_utils.TestAppendSink(sink_columns, sink_types))

        rows = [(1, 2, 1), (2, 5, 2), (3, 1, 3)]
        table = env.from_elements(rows, ['a', 'b', 'c'])
        projected = table.select("add(add_one(a), subtract_one(b)), c, 1")
        exec_insert_table(projected, "Results")

        self.assert_equals(source_sink_utils.results(), ["3,1,1", "7,2,1", "4,3,1"])
    def test_from_pandas(self):
        """Create a table from ``self.pdf`` and check its schema and filtered rows."""
        env = self.t_env
        row_type = self.data_type

        table = env.from_pandas(self.pdf, row_type, 5)
        self.assertEqual(row_type, table.get_schema().to_row_data_type())

        filtered = table.filter(table.f2 < 2)
        sink = source_sink_utils.TestAppendSink(
            row_type.field_names(), row_type.field_types())
        env.register_table_sink("Results", sink)
        filtered.execute_insert("Results").wait()

        # Only one row is expected to pass the ``f2 < 2`` filter.
        expected_row = (
            "1,1,1,1,true,1.1,1.2,hello,[97, 97, 97],"
            "1000000000000000000.010000000000000000,2014-09-13,01:00:01,"
            "1970-01-01 00:00:00.123,[hello, 中文],1,hello,"
            "1970-01-01 00:00:00.123,[1, 2]"
        )
        self.assert_equals(source_sink_utils.results(), [expected_row])
Beispiel #25
0
    def test_select(self):
        """Project ``a + 1, b, c`` from an in-memory table and verify the sink rows."""
        env = self.t_env
        rows = [(1, 'hi', 'hello'), (2, 'hi', 'hello')]
        table = env.from_elements(rows, ['a', 'b', 'c'])

        sink_columns = ["a", "b", "c"]
        sink_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
        env.register_table_sink(
            "Results", sink_columns, sink_types, source_sink_utils.TestAppendSink())

        projected = table.select("a + 1, b, c")
        projected.insert_into("Results")
        env.execute()

        self.assert_equals(source_sink_utils.results(), ['2,hi,hello', '3,hi,hello'])
Beispiel #26
0
    def test_chaining_scalar_function(self):
        """Nest registered UDF calls (``add(add_one(a), subtract_one(b))``) in a select."""
        env = self.t_env

        add_one_udf = udf(lambda i: i + 1, DataTypes.BIGINT(), DataTypes.BIGINT())
        env.register_function("add_one", add_one_udf)
        subtract_one_udf = udf(SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT())
        env.register_function("subtract_one", subtract_one_udf)
        env.register_function("add", add)

        sink = source_sink_utils.TestAppendSink(['a'], [DataTypes.BIGINT()])
        env.register_table_sink("Results", sink)

        table = env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        projected = table.select("add(add_one(a), subtract_one(b))")
        projected.insert_into("Results")
        env.execute("test")

        self.assert_equals(source_sink_utils.results(), ["3", "7", "4"])
Beispiel #27
0
    def test_group_by(self):
        """Group by ``c`` and sum ``a``; all rows share one group in this data set."""
        env = self.t_env
        rows = [(1, 'Hi', 'Hello'), (2, 'Hello', 'Hello'), (2, 'Hello', 'Hello')]
        table = env.from_elements(rows, ['a', 'b', 'c'])

        sink_columns = ["a", "b"]
        sink_types = [DataTypes.BIGINT(), DataTypes.STRING()]
        env.register_table_sink(
            "Results", sink_columns, sink_types, source_sink_utils.TestRetractSink())

        grouped = table.group_by("c").select("a.sum, c as b")
        grouped.insert_into("Results")
        env.execute()

        self.assert_equals(source_sink_utils.results(), ['5,Hello'])
Beispiel #28
0
    def test_sliding_group_window_over_count(self):
        """Aggregate with ``my_sum`` over a sliding row-count window (size 2, step 1).

        Uses the expression API for the window definition; the input comes from a
        CSV file read through a filesystem table with a processing-time column.
        """
        from pyflink.testing import source_sink_utils

        env = self.t_env
        env.get_config().set("parallelism.default", "1")

        # Write the input rows to a CSV file in the test's temp directory.
        csv_rows = [
            '1,1,2,2018-03-11 03:10:00', '3,3,2,2018-03-11 03:10:00',
            '2,2,1,2018-03-11 03:10:00', '1,1,3,2018-03-11 03:40:00',
            '1,1,8,2018-03-11 04:20:00', '2,2,3,2018-03-11 03:30:00',
            '3,3,3,2018-03-11 03:30:00'
        ]
        source_path = self.tempdir + '/test_sliding_group_window_over_count.csv'
        with open(source_path, 'w') as csv_file:
            csv_file.writelines(row + '\n' for row in csv_rows)

        env.register_function("my_sum", SumAggregateFunction())

        env.execute_sql("""
            create table source_table(
                a TINYINT,
                b SMALLINT,
                c SMALLINT,
                protime as PROCTIME()
            ) with(
                'connector.type' = 'filesystem',
                'format.type' = 'csv',
                'connector.path' = '%s',
                'format.ignore-first-line' = 'false',
                'format.field-delimiter' = ','
            )
        """ % source_path)
        t = env.from_path("source_table")

        env.execute_sql("""
        CREATE TABLE Results(a TINYINT, d BIGINT) WITH ('connector'='test-sink')
        """)

        window = Slide.over(row_interval(2)).every(row_interval(1)).on(t.protime).alias("w")
        grouped = t.window(window).group_by(t.a, col("w"))
        aggregated = grouped.select(t.a, call("my_sum", t.c).alias("b"))
        aggregated.execute_insert("Results").wait()

        self.assert_equals(
            source_sink_utils.results(),
            ["+I[1, 5]", "+I[1, 11]", "+I[2, 4]", "+I[3, 5]"])
Beispiel #29
0
    def test_chaining_scalar_function(self):
        """Nest UDF calls (``add(add_one(a), subtract_one(b))``) via the expression API."""
        env = self.t_env
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        sink_columns = ['a', 'b', 'c']
        sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()]
        env.register_table_sink(
            "Results", source_sink_utils.TestAppendSink(sink_columns, sink_types))

        rows = [(1, 2, 1), (2, 5, 2), (3, 1, 3)]
        t = env.from_elements(rows, ['a', 'b', 'c'])
        nested_call = add(add_one(t.a), subtract_one(t.b))
        t.select(nested_call, t.c, expr.lit(1)).execute_insert("Results").wait()

        self.assert_equals(source_sink_utils.results(), ["3,1,1", "7,2,1", "4,3,1"])
Beispiel #30
0
    def test_window_aggregate_with_pandas_udaf(self):
        """Aggregate column ``b`` per 1-hour tumbling window with a pandas UDAF.

        The UDAF returns the mean and the max of ``b`` as a two-field row; the
        result rows carry ``w.rowtime`` plus those two values.
        """
        import datetime
        from pyflink.table.window import Tumble
        # Input rows: (a, b, c, rowtime); only b and rowtime are selected below.
        t = self.t_env.from_elements(
            [
                (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
                (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
                (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
                (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0))
            ],
            DataTypes.ROW(
                [DataTypes.FIELD("a", DataTypes.TINYINT()),
                 DataTypes.FIELD("b", DataTypes.SMALLINT()),
                 DataTypes.FIELD("c", DataTypes.INT()),
                 DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3))]))

        # Sink columns: window rowtime, mean of b, max of b.
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [
                DataTypes.TIMESTAMP(3),
                DataTypes.FLOAT(),
                DataTypes.INT()
            ])
        self.t_env.register_table_sink("Results", table_sink)
        # NOTE(review): the lambda parameter is named ``pd`` but is the UDAF's
        # input (accessed as ``pd.b``), not the pandas module.
        pandas_udaf = udaf(lambda pd: (pd.b.mean(), pd.b.max()),
                           result_type=DataTypes.ROW(
                               [DataTypes.FIELD("a", DataTypes.FLOAT()),
                                DataTypes.FIELD("b", DataTypes.INT())]),
                           func_type="pandas")
        # One-hour tumbling window over the rowtime column, aliased as "w".
        tumble_window = Tumble.over(expr.lit(1).hours) \
            .on(expr.col("rowtime")) \
            .alias("w")
        # The two UDAF result fields are exposed as "d" and "e".
        t.select(t.b, t.rowtime) \
            .window(tumble_window) \
            .group_by("w") \
            .aggregate(pandas_udaf.alias("d", "e")) \
            .select("w.rowtime, d, e") \
            .execute_insert("Results") \
            .wait()

        actual = source_sink_utils.results()
        # Two windows are expected: 03:00-04:00 (mean 2.2, max 3) and
        # 04:00-05:00 (mean 8.0, max 8).
        self.assert_equals(actual,
                           ["2018-03-11 03:59:59.999,2.2,3",
                            "2018-03-11 04:59:59.999,8.0,8"])
Beispiel #31
0
    def test_set_environment(self):
        """Check the environment seen by Python UDFs during execution.

        A symlink to the current interpreter is configured as the Python
        executable. One UDF asserts that ``os.environ["python"]`` equals that
        link; the other asserts that calling ``get_gateway()`` from inside a
        UDF fails with the expected message.
        """
        python_exec_link_path = os.path.join(self.tempdir, "py_exec")
        os.symlink(sys.executable, python_exec_link_path)
        table_config = self.t_env.get_config()
        table_config.set_python_executable(python_exec_link_path)

        def check_python_exec(i):
            # Imported locally so the function body is self-contained.
            import os
            assert os.environ["python"] == python_exec_link_path
            return i

        exec_check = udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT())
        self.t_env.create_temporary_system_function("check_python_exec", exec_check)

        def check_pyflink_gateway_disabled(i):
            try:
                from pyflink.java_gateway import get_gateway
                get_gateway()
            except Exception as e:
                expected_prefix = (
                    "It's launching the PythonGatewayServer during Python UDF"
                    " execution which is unexpected.")
                assert str(e).startswith(expected_prefix)
            else:
                raise Exception("The gateway server is not disabled!")
            return i

        self.t_env._remote_mode = True
        gateway_check = udf(check_pyflink_gateway_disabled,
                            DataTypes.BIGINT(),
                            DataTypes.BIGINT())
        self.t_env.create_temporary_system_function(
            "check_pyflink_gateway_disabled", gateway_check)

        sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
        table_sink = source_sink_utils.TestAppendSink(['a', 'b'], sink_types)
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        checked = t.select(
            expr.call('check_python_exec', t.a),
            expr.call('check_pyflink_gateway_disabled', t.a))
        checked.execute_insert("Results").wait()

        expected = ["+I[1, 1]", "+I[2, 2]", "+I[3, 3]"]
        self.assert_equals(source_sink_utils.results(), expected)
Beispiel #32
0
    def test_udf_without_arguments(self):
        """Select two zero-argument UDFs and check each input row yields ``1,2``.

        One UDF is declared deterministic and the other non-deterministic.
        """
        bigint = DataTypes.BIGINT
        const_one = udf(lambda: 1, result_type=bigint(), deterministic=True)
        const_two = udf(lambda: 2, result_type=bigint(), deterministic=False)

        sink_types = [bigint(), bigint()]
        table_sink = source_sink_utils.TestAppendSink(['a', 'b'], sink_types)
        self.t_env.register_table_sink("Results", table_sink)

        rows = [(1, 2), (2, 5), (3, 1)]
        t = self.t_env.from_elements(rows, ['a', 'b'])
        projected = t.select(const_one(), const_two())
        exec_insert_table(projected, "Results")

        # One output row per input row, regardless of the input values.
        expected = ["1,2", "1,2", "1,2"]
        self.assert_equals(source_sink_utils.results(), expected)
Beispiel #33
0
    def test_udf_in_join_condition(self):
        """Join two single-row tables using a Python UDF inside the join predicate."""
        left = self.t_env.from_elements([(2, "Hi")], ['a', 'b'])
        right = self.t_env.from_elements([(2, "Flink")], ['c', 'd'])

        # UDF that returns its argument unchanged, declared as BIGINT.
        identity = udf(lambda i: i, result_type=DataTypes.BIGINT())

        sink_names = ['a', 'b', 'c', 'd']
        sink_types = [
            DataTypes.BIGINT(),
            DataTypes.STRING(),
            DataTypes.BIGINT(),
            DataTypes.STRING(),
        ]
        table_sink = source_sink_utils.TestAppendSink(sink_names, sink_types)
        self.t_env.register_table_sink("Results", table_sink)

        joined = left.join(right).where(identity(left.a) == right.c)
        exec_insert_table(joined, "Results")

        self.assert_equals(source_sink_utils.results(), ["2,Hi,2,Flink"])
    def test_sql_update(self):
        """Copy every row of a CSV source into an append sink via ``sql_update``."""
        # Pass the file name as its own component so os.path.join actually
        # builds the path instead of receiving a pre-concatenated string.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        # NOTE(review): DataTypes members are referenced uncalled here —
        # presumably this targets an older pyflink API; confirm.
        field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

        t_env = self.t_env
        t_env.register_table_source("source", csv_source)
        # The sink reuses the source schema because the query is a plain copy.
        t_env.register_table_sink(
            "sinks",
            field_names, field_types, source_sink_utils.TestAppendSink())

        t_env.sql_update("insert into sinks select * from source")
        t_env.execute("test_sql_job")

        expected = ['1,Hi,Hello', '2,Hello,Hello']
        self.assert_equals(source_sink_utils.results(), expected)
    def test_from_table_source(self):
        """Build a table directly from a CSV table source and write it to a sink."""
        # Pass the file name as its own component so os.path.join actually
        # builds the path instead of receiving a pre-concatenated string.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, "Hi", "Hello"), (2, "Hi", "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

        t_env = self.t_env
        t_env.register_table_sink(
            "Sinks",
            field_names, field_types, source_sink_utils.TestAppendSink())

        # The source is never registered under a name; it is wrapped directly.
        source = t_env.from_table_source(csv_source)
        source.insert_into("Sinks")
        t_env.execute()

        expected = ['1,Hi,Hello', '2,Hi,Hello']
        self.assert_equals(source_sink_utils.results(), expected)
    def test_sql_query(self):
        """Run a SQL query over a scanned table and check the projected rows."""
        # Pass the file name as its own component so os.path.join actually
        # builds the path instead of receiving a pre-concatenated string.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        # NOTE(review): DataTypes members are referenced uncalled here —
        # presumably this targets an older pyflink API; confirm.
        field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

        t_env = self.t_env
        t_env.register_table_source("Source", csv_source)
        source = t_env.scan("Source")
        t_env.register_table_sink(
            "sinks",
            field_names, field_types, source_sink_utils.TestAppendSink())

        # The Table object itself is formatted into the SQL text, so the
        # query refers to whatever str(source) yields.
        result = t_env.sql_query("select a + 1, b, c from %s" % source)
        result.insert_into("sinks")
        t_env.execute()

        expected = ['2,Hi,Hello', '3,Hello,Hello']
        self.assert_equals(source_sink_utils.results(), expected)
Beispiel #37
0
    def test_filter(self):
        """Filter a CSV-backed table with a string predicate and check the survivors."""
        # Pass the file name as its own component so os.path.join actually
        # builds the path instead of receiving a pre-concatenated string.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

        t_env = self.t_env
        t_env.register_table_source("Source", csv_source)
        source = t_env.scan("Source")
        t_env.register_table_sink(
            "Results",
            field_names, field_types, source_sink_utils.TestAppendSink())

        # Only the second input row has a > 1 and b = 'Hello'.
        result = source.filter("a > 1 && b = 'Hello'")
        result.insert_into("Results")
        t_env.execute()

        expected = ['2,Hello,Hello']
        self.assert_equals(source_sink_utils.results(), expected)
Beispiel #38
0
    def test_rename_columns(self):
        """Rename columns a/b/c to d/e/f and check the row values are unchanged."""
        # Pass the file name as its own component so os.path.join actually
        # builds the path instead of receiving a pre-concatenated string.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        source_field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(
            source_path, data, field_types, source_field_names)

        t_env = self.t_env
        t_env.register_table_source("Source", csv_source)
        source = t_env.scan("Source")

        # The sink keeps the source column types but uses the renamed columns.
        sink_field_names = ["d", "e", "f"]
        t_env.register_table_sink(
            "Results",
            sink_field_names, field_types, source_sink_utils.TestAppendSink())

        renamed = source.select("a, b, c").rename_columns("a as d, c as f, b as e")
        result = renamed.select("d, e, f")
        result.insert_into("Results")
        t_env.execute()

        expected = ['1,Hi,Hello', '2,Hello,Hello']
        self.assert_equals(source_sink_utils.results(), expected)
Beispiel #39
0
    def test_add_columns(self):
        """Add two derived columns (``a + 1`` and ``a + 2``) and check the result."""
        # Pass the file name as its own component so os.path.join actually
        # builds the path instead of receiving a pre-concatenated string.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        # NOTE(review): DataTypes members are referenced uncalled here —
        # presumably this targets an older pyflink API; confirm.
        source_field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(
            source_path, data, source_field_types, field_names)

        t_env = self.t_env
        t_env.register_table_source("Source", csv_source)
        source = t_env.scan("Source")

        # The sink reuses the column names a/b/c, but all three are integers
        # because b and c are recomputed from a.
        sink_field_types = [DataTypes.INT, DataTypes.INT, DataTypes.INT]
        t_env.register_table_sink(
            "Results",
            field_names, sink_field_types, source_sink_utils.TestAppendSink())

        result = source.select("a").add_columns("a + 1 as b, a + 2 as c")
        result.insert_into("Results")
        t_env.execute()

        expected = ['1,2,3', '2,3,4']
        self.assert_equals(source_sink_utils.results(), expected)
    def test_sql_update_with_query_config(self):
        """Run ``sql_update`` with a query config that sets idle state retention."""
        # Pass the file name as its own component so os.path.join actually
        # builds the path instead of receiving a pre-concatenated string.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

        t_env = self.t_env
        t_env.register_table_source("source", csv_source)
        t_env.register_table_sink(
            "sinks",
            field_names, field_types, source_sink_utils.TestAppendSink())

        # Retention times passed to the config: one day and two days.
        query_config = t_env.query_config()
        query_config.with_idle_state_retention_time(
            datetime.timedelta(days=1), datetime.timedelta(days=2))

        t_env.sql_update("insert into sinks select * from source", query_config)
        t_env.execute("test_sql_job")

        expected = ['1,Hi,Hello', '2,Hello,Hello']
        self.assert_equals(source_sink_utils.results(), expected)
Beispiel #41
0
    def test_distinct(self):
        """Apply ``distinct()`` to a table with a duplicated row and check the output.

        After deduplication, columns ``a`` and ``c`` (aliased to ``b``) are
        written to a retract sink.
        """
        # Pass the file name as its own component so os.path.join actually
        # builds the path instead of receiving a pre-concatenated string.
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        source_field_names = ["a", "b", "c"]
        source_field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        # The last two rows are identical.
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello"), (2, "Hello", "Hello")]
        csv_source = self.prepare_csv_source(
            source_path, data, source_field_types, source_field_names)

        t_env = self.t_env
        t_env.register_table_source("Source", csv_source)
        source = t_env.scan("Source")

        sink_field_names = ["a", "b"]
        sink_field_types = [DataTypes.INT(), DataTypes.STRING()]
        t_env.register_table_sink(
            "Results",
            sink_field_names, sink_field_types, source_sink_utils.TestRetractSink())

        result = source.distinct().select("a, c as b")
        result.insert_into("Results")
        t_env.execute()

        expected = ['1,Hello', '2,Hello']
        self.assert_equals(source_sink_utils.results(), expected)