Example #1
    def execute_with_table_execute_insert(self, t_env):
        source = t_env.from_elements([(1, "Hi"), (2, "Hello")], ["a", "b"])
        result = source.select("func1(a, b), func2(a, b)")
        exec_insert_table(result, "sink")
        actual = source_sink_utils.results()
        expected = ['1 and Hi,1 or Hi', '2 and Hello,2 or Hello']
        self.assert_equals(actual, expected)
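Every example on this page goes through the same exec_insert_table test helper, which is never shown. A minimal sketch of what it might look like, assuming the Flink 1.11 Table API (the real utility lives in Flink's test sources, so the exact body may differ):

def exec_insert_table(table, table_path):
    # Submit an INSERT INTO job for the given sink and block until it
    # finishes; the synchronous behaviour is what lets the tests read
    # the sink results immediately afterwards.
    return table.execute_insert(table_path) \
        .get_job_client() \
        .get_job_execution_result() \
        .result()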
Example #2
    def test_stream_case(self):
        from pyflink.shell import s_env, st_env, FileSystem, OldCsv, DataTypes, Schema
        # example begin

        import tempfile
        import os
        import shutil
        sink_path = tempfile.gettempdir() + '/streaming.csv'
        if os.path.exists(sink_path):
            if os.path.isfile(sink_path):
                os.remove(sink_path)
            else:
                shutil.rmtree(sink_path)
        s_env.set_parallelism(1)
        t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')],
                                 ['a', 'b', 'c'])
        st_env.connect(FileSystem().path(sink_path))\
            .with_format(OldCsv()
                         .field_delimiter(',')
                         .field("a", DataTypes.BIGINT())
                         .field("b", DataTypes.STRING())
                         .field("c", DataTypes.STRING()))\
            .with_schema(Schema()
                         .field("a", DataTypes.BIGINT())
                         .field("b", DataTypes.STRING())
                         .field("c", DataTypes.STRING()))\
            .create_temporary_table("stream_sink")

        exec_insert_table(t.select("a + 1, b, c"), "stream_sink")

        # verification code; do not copy it into shell.py
        with open(sink_path, 'r') as f:
            lines = f.read()
            self.assertEqual(lines, '2,hi,hello\n' + '3,hi,hello\n')
Example #3
    def test_table_environment_with_blink_planner(self):
        t_env = BatchTableEnvironment.create(
            environment_settings=EnvironmentSettings.new_instance(
            ).in_batch_mode().use_blink_planner().build())

        source_path = os.path.join(self.tempdir, 'streaming.csv')
        sink_path = os.path.join(self.tempdir, 'results')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)

        t_env.register_table_source("source", csv_source)

        t_env.register_table_sink(
            "sink", CsvTableSink(field_names, field_types, sink_path))
        source = t_env.from_path("source")

        result = source.alias("a, b, c").select("1 + a, b, c")

        exec_insert_table(result, "sink")

        results = []
        for root, dirs, files in os.walk(sink_path):
            for sub_file in files:
                with open(os.path.join(root, sub_file), 'r') as f:
                    line = f.readline()
                    while line is not None and line != '':
                        results.append(line)
                        line = f.readline()

        self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
Example #4
    def test_add_python_file(self):
        python_file_dir = os.path.join(self.tempdir,
                                       "python_file_dir_" + str(uuid.uuid4()))
        os.mkdir(python_file_dir)
        python_file_path = os.path.join(python_file_dir,
                                        "test_dependency_manage_lib.py")
        with open(python_file_path, 'w') as f:
            f.write("def add_two(a):\n    return a + 2")
        self.t_env.add_python_file(python_file_path)

        def plus_two(i):
            from test_dependency_manage_lib import add_two
            return add_two(i)

        self.t_env.create_temporary_system_function(
            "add_two", udf(plus_two, DataTypes.BIGINT(), DataTypes.BIGINT()))
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        exec_insert_table(t.select(expr.call("add_two", t.a), t.a), "Results")

        actual = source_sink_utils.results()
        self.assert_equals(actual, ["3,1", "4,2", "5,3"])
Example #5
    def test_data_types_only_supported_in_blink_planner(self):
        import pandas as pd

        timezone = self.t_env.get_config().get_local_timezone()
        local_datetime = pytz.timezone(timezone).localize(
            datetime.datetime(1970, 1, 2, 0, 0, 0, 123000))

        def local_zoned_timestamp_func(local_zoned_timestamp_param):
            assert isinstance(local_zoned_timestamp_param, pd.Series)
            assert isinstance(local_zoned_timestamp_param[0], datetime.datetime), \
                'local_zoned_timestamp_param of wrong type %s !' % type(
                    local_zoned_timestamp_param[0])
            assert local_zoned_timestamp_param[0] == local_datetime, \
                'local_zoned_timestamp_param is wrong value %s, %s!' % \
                (local_zoned_timestamp_param[0], local_datetime)
            return local_zoned_timestamp_param

        self.t_env.create_temporary_system_function(
            "local_zoned_timestamp_func",
            udf(local_zoned_timestamp_func,
                result_type=DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3),
                udf_type="pandas"))

        table_sink = source_sink_utils.TestAppendSink(
            ['a'], [DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3)])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements(
            [(local_datetime,)],
            DataTypes.ROW([DataTypes.FIELD("a", DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))]))

        exec_insert_table(t.select("local_zoned_timestamp_func(local_zoned_timestamp_func(a))"),
                          "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1970-01-02T00:00:00.123Z"])
Example #6
    def test_add_python_archive(self):
        tmp_dir = self.tempdir
        archive_dir_path = os.path.join(tmp_dir,
                                        "archive_" + str(uuid.uuid4()))
        os.mkdir(archive_dir_path)
        with open(os.path.join(archive_dir_path, "data.txt"), 'w') as f:
            f.write("2")
        archive_file_path = \
            shutil.make_archive(os.path.dirname(archive_dir_path), 'zip', archive_dir_path)
        self.t_env.add_python_archive(archive_file_path, "data")

        def add_from_file(i):
            with open("data/data.txt", 'r') as f:
                return i + int(f.read())

        self.t_env.create_temporary_system_function(
            "add_from_file",
            udf(add_from_file, DataTypes.BIGINT(), DataTypes.BIGINT()))
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        exec_insert_table(t.select(expr.call('add_from_file', t.a), t.a),
                          "Results")

        actual = source_sink_utils.results()
        self.assert_equals(actual, ["3,1", "4,2", "5,3"])
Example #7
    def test_basic_functionality(self):
        # pandas UDF
        self.t_env.create_temporary_system_function(
            "add_one",
            udf(lambda i: i + 1,
                result_type=DataTypes.BIGINT(),
                udf_type="pandas"))

        self.t_env.create_temporary_system_function("add", add)

        # general Python UDF
        self.t_env.create_temporary_system_function(
            "subtract_one",
            udf(SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT()))

        table_sink = source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd'], [
            DataTypes.BIGINT(),
            DataTypes.BIGINT(),
            DataTypes.BIGINT(),
            DataTypes.BIGINT()
        ])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)],
                                     ['a', 'b', 'c'])
        exec_insert_table(
            t.where("add_one(b) <= 3").select(
                "a, b + 1, add(a + 1, subtract_one(c)) + 2, "
                "add(add_one(a), 1L)"), "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1,3,6,3", "3,2,14,5"])
Example #8
    def test_set_requirements_without_cached_directory(self):
        requirements_txt_path = os.path.join(self.tempdir, str(uuid.uuid4()))
        with open(requirements_txt_path, 'w') as f:
            f.write("cloudpickle==1.2.2")
        self.t_env.set_python_requirements(requirements_txt_path)

        def check_requirements(i):
            import cloudpickle
            assert os.path.abspath(cloudpickle.__file__).startswith(
                os.environ['_PYTHON_REQUIREMENTS_INSTALL_DIR'])
            return i

        self.t_env.create_temporary_system_function(
            "check_requirements",
            udf(check_requirements, DataTypes.BIGINT(), DataTypes.BIGINT()))
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        exec_insert_table(t.select(expr.call('check_requirements', t.a), t.a),
                          "Results")

        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1,1", "2,2", "3,3"])
Example #9
    def test_pipeline(self):
        t_env = MLEnvironmentFactory().get_default(
        ).get_stream_table_environment()
        train_table = t_env.from_elements([(1, 2), (1, 4), (1, 0), (10, 2),
                                           (10, 4), (10, 0)], ['a', 'b'])
        serving_table = t_env.from_elements([(0, 0), (12, 3)], ['a', 'b'])

        table_sink = source_sink_utils.TestAppendSink(['predict_result'],
                                                      [DataTypes.BOOLEAN()])
        t_env.register_table_sink("PredictResults", table_sink)

        # transformer: outputs a "features" column which is the sum of a and b
        transformer = PythonAddTransformer(selected_cols=["a", "b"],
                                           output_col="features")

        # estimator
        estimator = PythonEstimator()\
            .set_vector_col("features")\
            .set_prediction_col("predict_result")

        # pipeline
        pipeline = Pipeline().append_stage(transformer).append_stage(estimator)
        exec_insert_table(
            pipeline.fit(t_env, train_table).transform(t_env, serving_table),
            'PredictResults')

        actual = source_sink_utils.results()
        # the first prediction is false since 0 + 0 is smaller than the max_sum (14).
        # the second prediction is true since 12 + 3 is bigger than the max_sum (14).
        self.assert_equals(actual, ["false", "true"])
Example #10
    def test_data_types_only_supported_in_blink_planner(self):
        timezone = self.t_env.get_config().get_local_timezone()
        local_datetime = pytz.timezone(timezone).localize(
            datetime.datetime(1970, 1, 1, 0, 0, 0, 123000))

        @udf(result_type=DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))
        def local_zoned_timestamp_func(local_zoned_timestamp_param):
            assert local_zoned_timestamp_param == local_datetime, \
                'local_zoned_timestamp_param is wrong value %s !' % local_zoned_timestamp_param
            return local_zoned_timestamp_param

        table_sink = source_sink_utils.TestAppendSink(
            ['a'], [DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3)])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements(
            [(local_datetime, )],
            DataTypes.ROW([
                DataTypes.FIELD("a",
                                DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3))
            ]))

        exec_insert_table(
            t.select(
                local_zoned_timestamp_func(local_zoned_timestamp_func(t.a))),
            "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1970-01-01T00:00:00.123Z"])
Example #11
    def test_pipeline_from_and_to_java_json(self):
        # JSON generated from the Java API
        java_json = '[{"stageClassName":"org.apache.flink.ml.pipeline.' \
                    'UserDefinedPipelineStages$SelectColumnTransformer",' \
                    '"stageJson":"{\\"selectedCols\\":\\"[\\\\\\"a\\\\\\",' \
                    '\\\\\\"b\\\\\\"]\\"}"}]'

        # load json
        p = Pipeline()
        p.load_json(java_json)
        python_json = p.to_json()

        t_env = MLEnvironmentFactory().get_default(
        ).get_stream_table_environment()

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        t_env.register_table_sink("TestJsonResults", table_sink)

        source_table = t_env.from_elements([(1, 2, 3, 4), (4, 3, 2, 1)],
                                           ['a', 'b', 'c', 'd'])
        transformer = p.get_stages()[0]
        exec_insert_table(transformer.transform(t_env, source_table),
                          "TestJsonResults")

        actual = source_sink_utils.results()

        self.assert_equals(actual, ["1,2", "4,3"])
        self.assertEqual(python_json, java_json)
Example #12
    def test_table_environment_with_blink_planner(self):
        self.env.set_parallelism(1)
        t_env = StreamTableEnvironment.create(
            self.env,
            environment_settings=EnvironmentSettings.new_instance(
            ).use_blink_planner().build())

        source_path = os.path.join(self.tempdir, 'streaming.csv')
        sink_path = os.path.join(self.tempdir, 'result.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
        csv_source = self.prepare_csv_source(source_path, data, field_types,
                                             field_names)

        t_env.register_table_source("source", csv_source)

        t_env.register_table_sink(
            "sink", CsvTableSink(field_names, field_types, sink_path))
        source = t_env.from_path("source")

        result = source.alias("a, b, c").select("1 + a, b, c")

        exec_insert_table(result, "sink")

        results = []
        with open(sink_path, 'r') as f:
            results.append(f.readline())
            results.append(f.readline())

        self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])
Example #13
    def load_model(self, table_env):
        """
        Train the model to get the max_sum value which is used to predict data.
        """
        table_sink = source_sink_utils.TestRetractSink(["max_sum"],
                                                       [DataTypes.BIGINT()])
        table_env.register_table_sink("Model_Results", table_sink)
        exec_insert_table(self._model_data_table, "Model_Results")
        actual = source_sink_utils.results()
        self.max_sum = actual.apply(0)
Example #14
    def test_from_element(self):
        t_env = self.t_env
        field_names = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r"
        ]
        field_types = [
            DataTypes.BIGINT(),
            DataTypes.DOUBLE(),
            DataTypes.STRING(),
            DataTypes.STRING(),
            DataTypes.DATE(),
            DataTypes.TIME(),
            DataTypes.TIMESTAMP(3),
            DataTypes.INTERVAL(DataTypes.SECOND(3)),
            DataTypes.ARRAY(DataTypes.DOUBLE()),
            DataTypes.ARRAY(DataTypes.DOUBLE(False)),
            DataTypes.ARRAY(DataTypes.STRING()),
            DataTypes.ARRAY(DataTypes.DATE()),
            DataTypes.DECIMAL(38, 18),
            DataTypes.ROW([
                DataTypes.FIELD("a", DataTypes.BIGINT()),
                DataTypes.FIELD("b", DataTypes.DOUBLE())
            ]),
            DataTypes.MAP(DataTypes.STRING(), DataTypes.DOUBLE()),
            DataTypes.BYTES(),
            ExamplePointUDT(),
            PythonOnlyUDT()
        ]
        schema = DataTypes.ROW(
            list(
                map(
                    lambda field_name, field_type: DataTypes.FIELD(
                        field_name, field_type), field_names, field_types)))
        table_sink = source_sink_utils.TestAppendSink(field_names, field_types)
        t_env.register_table_sink("Results", table_sink)
        t = t_env.from_elements(
            [(1, 1.0, "hi", "hello", datetime.date(1970, 1, 2),
              datetime.time(1, 0, 0), datetime.datetime(1970, 1, 2, 0, 0),
              datetime.timedelta(days=1, microseconds=10), [1.0, None],
              array.array("d", [1.0, 2.0]), ["abc"],
              [datetime.date(1970, 1, 2)], Decimal(1), Row("a", "b")(1, 2.0), {
                  "key": 1.0
              }, bytearray(b'ABCD'), ExamplePoint(
                  1.0, 2.0), PythonOnlyPoint(3.0, 4.0))], schema)
        exec_insert_table(t, "Results")
        actual = source_sink_utils.results()

        expected = [
            '1,1.0,hi,hello,1970-01-02,01:00:00,1970-01-02 00:00:00.0,'
            '86400000,[1.0, null],[1.0, 2.0],[abc],[1970-01-02],'
            '1,1,2.0,{key=1.0},[65, 66, 67, 68],[1.0, 2.0],[3.0, 4.0]'
        ]
        self.assert_equals(actual, expected)
Example #15
    def test_open(self):
        self.t_env.get_config().get_configuration().set_string('python.metric.enabled', 'true')
        self.t_env.create_temporary_system_function(
            "subtract", udf(Subtract(), result_type=DataTypes.BIGINT()))
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2), (2, 5), (3, 4)], ['a', 'b'])
        exec_insert_table(t.select("a, subtract(b)"), "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1,1", "2,4", "3,3"])
Example #16
    def test_overwrite_builtin_function(self):
        self.t_env.create_temporary_system_function(
            "plus", udf(lambda i, j: i + j - 1,
                        result_type=DataTypes.BIGINT()))

        table_sink = source_sink_utils.TestAppendSink(['a'], [DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
        exec_insert_table(t.select("plus(a, b)"), "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["2", "6", "3"])
Example #17
    def test_insert_into(self):
        t_env = self.t_env
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
        t_env.register_table_sink(
            "Sinks",
            source_sink_utils.TestAppendSink(field_names, field_types))

        exec_insert_table(t_env.from_elements([(1, "Hi", "Hello")], ["a", "b", "c"]), "Sinks")

        actual = source_sink_utils.results()
        expected = ['1,Hi,Hello']
        self.assert_equals(actual, expected)
Example #18
    def test_set_requirements_with_cached_directory(self):
        tmp_dir = self.tempdir
        requirements_txt_path = os.path.join(
            tmp_dir, "requirements_txt_" + str(uuid.uuid4()))
        with open(requirements_txt_path, 'w') as f:
            f.write("python-package1==0.0.0")

        requirements_dir_path = os.path.join(
            tmp_dir, "requirements_dir_" + str(uuid.uuid4()))
        os.mkdir(requirements_dir_path)
        package_file_name = "python-package1-0.0.0.tar.gz"
        with open(os.path.join(requirements_dir_path, package_file_name),
                  'wb') as f:
            import base64
            # This base64 data is encoded from a Python package file which includes a
            # "python_package1" module. The module contains a "plus(a, b)" function.
            # The base64 string can be recomputed with the following code:
            # base64.b64encode(open("python-package1-0.0.0.tar.gz", "rb").read()).decode("utf-8")
            f.write(
                base64.b64decode(
                    "H4sICNefrV0C/2Rpc3QvcHl0aG9uLXBhY2thZ2UxLTAuMC4wLnRhcgDtmVtv2jAYhnPtX2H1CrRCY+ckI"
                    "XEx7axuUA11u5imyICTRc1JiVnHfv1MKKWjYxwKEdPehws7xkmUfH5f+3PyqfqWpa1cjG5EKFnLbOvfhX"
                    "FQTI3nOPPSdavS5Pa8nGMwy3Esi3ke9wyTObbnGNQxamBSKlFQavzUryG8ldG6frpbEGx4yNmDLMp/hPy"
                    "P8b+6fNN613vdP1z8XdteG3+ug/17/F3Hcw1qIv5H54NUYiyUaH2SRRllaYeytkl6IpEdujI2yH2XapCQ"
                    "wSRJRDHt0OveZa//uUfeZonUvUO5bHo+0ZcoVo9bMhFRvGx9H41kWj447aUsR0WUq+pui8arWKggK5Jli"
                    "wGOo/95q79ovXi6/nfyf246Dof/n078fT9KI+X77Xx6BP83bX4Xf5NxT7dz7toO/L8OxjKgeTwpG+KcDp"
                    "sdQjWFVJMipYI+o0MCk4X/t2UYtqI0yPabCHb3f861XcD/Ty/+Y5nLdCzT0dSPo/SmbKsf6un+b7KV+Ls"
                    "W4/D/OoC9w/930P9eGwM75//csrD+Q/6P/P/k9D/oX3988Wqw1bS/tf6tR+s/m3EG/ddBqXO9XKf15C8p"
                    "P9k4HZBtBgzZaVW5vrfKcj+W32W82ygEB9D/Xu9+4/qfP9L/rBv0X1v87yONKRX61/qfzwqjIDzIPTbv/"
                    "7or3/88i0H/tfBFW7s/s/avRInQH06ieEy7tDrQeYHUdRN7wP+n/vf62LOH/pld7f9xz7a5Pfufedy0oP"
                    "86iJI8KxStAq6yLC4JWdbbVbWRikR2z1ZGytk5vauW3QdnBFE6XqwmykazCesAAAAAAAAAAAAAAAAAAAA"
                    "AAAAAAAAAAAAAAOBw/AJw5CHBAFAAAA=="))
        self.t_env.set_python_requirements(requirements_txt_path,
                                           requirements_dir_path)

        def add_one(i):
            from python_package1 import plus
            return plus(i, 1)

        self.t_env.create_temporary_system_function(
            "add_one", udf(add_one, DataTypes.BIGINT(), DataTypes.BIGINT()))
        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        exec_insert_table(t.select(expr.call('add_one', t.a), t.a), "Results")

        actual = source_sink_utils.results()
        self.assert_equals(actual, ["2,1", "3,2", "4,3"])
Example #19
    def test_udf_without_arguments(self):
        self.t_env.create_temporary_system_function("one", udf(
            lambda: 1, result_type=DataTypes.BIGINT(), deterministic=True))
        self.t_env.create_temporary_system_function("two", udf(
            lambda: 2, result_type=DataTypes.BIGINT(), deterministic=False))

        table_sink = source_sink_utils.TestAppendSink(['a', 'b'],
                                                      [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        exec_insert_table(t.select("one(), two()"), "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1,2", "1,2", "1,2"])
Example #20
    def test_sql_query(self):
        t_env = self.t_env
        source = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hello")], ["a", "b", "c"])
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
        t_env.register_table_sink(
            "sinks",
            source_sink_utils.TestAppendSink(field_names, field_types))

        result = t_env.sql_query("select a + 1, b, c from %s" % source)
        exec_insert_table(result, "sinks")
        actual = source_sink_utils.results()

        expected = ['2,Hi,Hello', '3,Hello,Hello']
        self.assert_equals(actual, expected)
Example #21
    def test_udf_in_join_condition(self):
        t1 = self.t_env.from_elements([(2, "Hi")], ['a', 'b'])
        t2 = self.t_env.from_elements([(2, "Flink")], ['c', 'd'])

        self.t_env.create_temporary_system_function("f", udf(lambda i: i,
                                                             result_type=DataTypes.BIGINT()))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd'],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.BIGINT(), DataTypes.STRING()])
        self.t_env.register_table_sink("Results", table_sink)

        exec_insert_table(t1.join(t2).where("f(a) = c"), "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["2,Hi,2,Flink"])
Example #22
    def test_java_transformer(self):
        t_env = MLEnvironmentFactory().get_default(
        ).get_stream_table_environment()

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        t_env.register_table_sink("TransformerResults", table_sink)

        source_table = t_env.from_elements([(1, 2, 3, 4), (4, 3, 2, 1)],
                                           ['a', 'b', 'c', 'd'])
        transformer = WrapperTransformer(selected_cols=["a", "b"])
        exec_insert_table(transformer.transform(t_env, source_table),
                          "TransformerResults")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1,2", "4,3"])
Example #23
    def test_chaining_scalar_function(self):
        self.t_env.create_temporary_system_function(
            "add_one", udf(lambda i: i + 1, result_type=DataTypes.BIGINT()))
        self.t_env.create_temporary_system_function(
            "subtract_one", udf(SubtractOne(), result_type=DataTypes.BIGINT()))
        self.t_env.create_temporary_system_function("add", add)

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.INT()])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)], ['a', 'b', 'c'])
        exec_insert_table(t.select("add(add_one(a), subtract_one(b)), c, 1"), "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["3,1,1", "7,2,1", "4,3,1"])
    def test_from_pandas(self):
        table = self.t_env.from_pandas(self.pdf, self.data_type, 5)
        self.assertEqual(self.data_type, table.get_schema().to_row_data_type())

        table = table.filter("f2 < 2")
        table_sink = source_sink_utils.TestAppendSink(
            self.data_type.field_names(), self.data_type.field_types())
        self.t_env.register_table_sink("Results", table_sink)
        exec_insert_table(table, "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, [
            "1,1,1,1,true,1.1,1.2,hello,[97, 97, 97],"
            "1000000000000000000.010000000000000000,2014-09-13,01:00:01,"
            "1970-01-01 00:00:00.123,[hello, 中文],1,hello,"
            "1970-01-01 00:00:00.123,[1, 2]"
        ])
Example #25
    def test_get_execution_plan(self):
        tmp_dir = tempfile.gettempdir()
        source_path = os.path.join(tmp_dir, 'streaming.csv')
        tmp_csv = os.path.join(tmp_dir, 'streaming2.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]

        t_env = StreamTableEnvironment.create(self.env)
        csv_source = CsvTableSource(source_path, field_names, field_types)
        t_env.register_table_source("Orders", csv_source)
        t_env.register_table_sink(
            "Results", CsvTableSink(field_names, field_types, tmp_csv))
        exec_insert_table(t_env.from_path("Orders"), "Results")

        plan = self.env.get_execution_plan()

        json.loads(plan)
Example #26
    def test_set_environment(self):
        python_exec = sys.executable
        tmp_dir = self.tempdir
        python_exec_link_path = os.path.join(tmp_dir, "py_exec")
        os.symlink(python_exec, python_exec_link_path)
        self.t_env.get_config().set_python_executable(python_exec_link_path)

        def check_python_exec(i):
            import os
            assert os.environ["python"] == python_exec_link_path
            return i

        self.t_env.create_temporary_system_function(
            "check_python_exec",
            udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT()))

        def check_pyflink_gateway_disabled(i):
            try:
                from pyflink.java_gateway import get_gateway
                get_gateway()
            except Exception as e:
                assert str(e).startswith(
                    "It's launching the PythonGatewayServer during Python UDF"
                    " execution which is unexpected.")
            else:
                raise Exception("The gateway server is not disabled!")
            return i

        self.t_env.create_temporary_system_function(
            "check_pyflink_gateway_disabled",
            udf(check_pyflink_gateway_disabled, DataTypes.BIGINT(),
                DataTypes.BIGINT()))

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b'],
            [DataTypes.BIGINT(), DataTypes.BIGINT()])
        self.t_env.register_table_sink("Results", table_sink)
        t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
        exec_insert_table(
            t.select(expr.call('check_python_exec', t.a),
                     expr.call('check_pyflink_gateway_disabled', t.a)),
            "Results")

        actual = source_sink_utils.results()
        self.assert_equals(actual, ["1,1", "2,2", "3,3"])
Example #27
    def test_chaining_scalar_function(self):
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c'],
            [DataTypes.BIGINT(),
             DataTypes.BIGINT(),
             DataTypes.INT()])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2, 1), (2, 5, 2), (3, 1, 3)],
                                     ['a', 'b', 'c'])
        exec_insert_table(
            t.select(add(add_one(t.a), subtract_one(t.b)), t.c, expr.lit(1)),
            "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["3,1,1", "7,2,1", "4,3,1"])
Example #28
    def test_scalar_function(self):
        # test with metrics disabled
        self.t_env.get_config().get_configuration().set_string(
            'python.metric.enabled', 'false')
        # test lambda function
        add_one = udf(lambda i: i + 1, result_type=DataTypes.BIGINT())

        # test Python ScalarFunction
        subtract_one = udf(SubtractOne(), result_type=DataTypes.BIGINT())

        # test callable function
        add_one_callable = udf(CallablePlus(), result_type=DataTypes.BIGINT())

        def partial_func(col, param):
            return col + param

        # test partial function
        import functools
        add_one_partial = udf(functools.partial(partial_func, param=1),
                              result_type=DataTypes.BIGINT())

        table_sink = source_sink_utils.TestAppendSink(
            ['a', 'b', 'c', 'd', 'e', 'f'], [
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT(),
                DataTypes.BIGINT()
            ])
        self.t_env.register_table_sink("Results", table_sink)

        t = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)],
                                     ['a', 'b', 'c'])
        exec_insert_table(
            t.where(add_one(t.b) <= 3).select(add_one(t.a), subtract_one(t.b),
                                              add(t.a, t.c),
                                              add_one_callable(t.a),
                                              add_one_partial(t.a), t.a),
            "Results")
        actual = source_sink_utils.results()
        self.assert_equals(actual, ["2,1,4,2,2,1", "4,0,12,4,4,3"])
Example #29
    def test_register_temporary_table(self):
        self.env.set_parallelism(1)
        source_path = os.path.join(self.tempdir, 'streaming.csv')
        field_names = ["a", "b", "c"]
        field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
        data = [(1, "Hi", "Hello"), (2, "Hello", "Hello")]
        self.prepare_csv_source(source_path, data, field_types, field_names)
        sink_path = os.path.join(self.tempdir, 'streaming2.csv')
        if os.path.isfile(sink_path):
            os.remove(sink_path)
        t_env = self.t_env

        t_env.connect(FileSystem().path(source_path))\
             .with_format(OldCsv()
                          .field_delimiter(',')
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .with_schema(Schema()
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .create_temporary_table("source")
        t_env.connect(FileSystem().path(sink_path))\
             .with_format(OldCsv()
                          .field_delimiter(',')
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .with_schema(Schema()
                          .field("a", DataTypes.INT())
                          .field("b", DataTypes.STRING())
                          .field("c", DataTypes.STRING()))\
             .create_temporary_table("sink")
        exec_insert_table(
            t_env.from_path("source").select("a + 1, b, c"), "sink")

        with open(sink_path, 'r') as f:
            lines = f.read()
            assert lines == '2,Hi,Hello\n' + "3,Hello,Hello\n"
Example #30
    def test_execute(self):
        tmp_dir = tempfile.gettempdir()
        field_names = ['a', 'b', 'c']
        field_types = [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()]
        t_env = StreamTableEnvironment.create(self.env)
        t_env.register_table_sink(
            'Results',
            CsvTableSink(field_names, field_types,
                         os.path.join(tmp_dir, '{}.csv'.format(round(time.time())))))
        execution_result = exec_insert_table(
            t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c']),
            'Results')
        self.assertIsNotNone(execution_result.get_job_id())
        self.assertIsNotNone(execution_result.get_net_runtime())
        self.assertEqual(len(execution_result.get_all_accumulator_results()), 0)
        self.assertIsNone(execution_result.get_accumulator_result('accumulator'))
        self.assertIsNotNone(str(execution_result))