def word_count():
    """Run a batch word-count job that inserts its result into a CSV table.

    ``sys.argv[1]`` and ``sys.argv[2]`` are read as a configuration key and
    value and applied to the table config. Any existing file or directory at
    ``<tempdir>/result`` is removed (a failure is only logged) before the
    ``Results`` sink table is created at that path.
    """
    content = ("line Licensed to the Apache Software Foundation ASF under one "
               "line or more contributor license agreements See the NOTICE file "
               "line distributed with this work for additional information "
               "line regarding copyright ownership The ASF licenses this file "
               "to you under the Apache License Version the "
               "License you may not use this file except in compliance "
               "with the License")

    t_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())

    # used to test pipeline.jars and pipeline.classpaths
    option_key, option_value = sys.argv[1], sys.argv[2]
    t_env.get_config().set(option_key, option_value)

    # register Results table in table environment
    result_path = tempfile.gettempdir() + '/result'
    if os.path.exists(result_path):
        # Files and directories need different removal calls.
        remove = os.remove if os.path.isfile(result_path) else shutil.rmtree
        try:
            remove(result_path)
        except OSError as err:
            logging.error("Error removing directory: %s - %s.", err.filename, err.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = (
        " create table Results( word VARCHAR, `count` BIGINT, `count_java` BIGINT ) "
        "with ( 'connector.type' = 'filesystem', 'format.type' = 'csv', "
        "'connector.path' = '{}' ) "
    ).format(result_path)
    t_env.execute_sql(sink_ddl)

    t_env.execute_sql(
        "create temporary system function add_one as 'add_one.add_one' language python"
    )
    t_env.register_java_function("add_one_java", "org.apache.flink.python.tests.util.AddOne")

    # Every word starts with a count of 0; both UDFs add one to it.
    rows = [(token, 0) for token in content.split(" ")]
    t = t_env.from_elements(rows, ["word", "count"])

    incremented = t.select(
        t.word,
        call("add_one", t.count).alias("count"),
        call("add_one_java", t.count).alias("count_java"))
    per_word = incremented.group_by(t.word).select(
        t.word,
        col("count").count.alias("count"),
        col("count_java").count.alias("count_java"))
    per_word.execute_insert("Results")
def test_set_environment(self):
    """Run two UDFs that inspect the worker environment and check the output.

    One UDF asserts that ``os.environ["python"]`` equals the configured
    Python executable; the other calls ``get_gateway()`` inside the UDF.
    """
    expected_python = sys.executable
    self.st_env.get_config().set_python_executable(expected_python)

    def check_python_exec(i):
        import os
        assert os.environ["python"] == expected_python
        return i

    def check_pyflink_gateway_disabled(i):
        from pyflink.java_gateway import get_gateway
        get_gateway()
        return i

    exec_udf = udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.st_env.create_temporary_system_function("check_python_exec", exec_udf)

    gateway_udf = udf(check_pyflink_gateway_disabled, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.st_env.create_temporary_system_function(
        "check_pyflink_gateway_disabled", gateway_udf)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.st_env.register_table_sink("Results", sink)

    t = self.st_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    checked = t.select(
        expr.call('check_python_exec', t.a),
        expr.call('check_pyflink_gateway_disabled', t.a))
    checked.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[1, 1]", "+I[2, 2]", "+I[3, 3]"])
def test_set_environment(self):
    """Run two UDFs that inspect the worker environment and check the output.

    One UDF asserts that ``os.environ["python"]`` equals the configured
    Python executable; the other calls ``get_gateway()`` inside the UDF.
    Results are written to a ``test-sink`` table created through DDL.
    """
    expected_python = sys.executable
    self.st_env.get_config().set_python_executable(expected_python)

    def check_python_exec(i):
        import os
        assert os.environ["python"] == expected_python
        return i

    def check_pyflink_gateway_disabled(i):
        from pyflink.java_gateway import get_gateway
        get_gateway()
        return i

    exec_udf = udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.st_env.create_temporary_system_function("check_python_exec", exec_udf)

    gateway_udf = udf(check_pyflink_gateway_disabled, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.st_env.create_temporary_system_function(
        "check_pyflink_gateway_disabled", gateway_udf)

    self.st_env.execute_sql(
        " CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink') ")

    t = self.st_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    checked = t.select(
        expr.call('check_python_exec', t.a),
        expr.call('check_pyflink_gateway_disabled', t.a))
    checked.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[1, 1]", "+I[2, 2]", "+I[3, 3]"])
def test_add_python_file(self):
    """Add two same-named Python files and check which ``add_two`` the UDF uses.

    The file added first raises when called; the file added second returns
    ``a + 2``. The expected rows correspond to the second implementation.
    """
    module_name = "test_dependency_manage_lib.py"

    # First file: its add_two must never run.
    failing_dir = os.path.join(self.tempdir, "python_file_dir_" + str(uuid.uuid4()))
    os.mkdir(failing_dir)
    failing_file = os.path.join(failing_dir, module_name)
    with open(failing_file, 'w') as out:
        out.write("def add_two(a):\n raise Exception('This function should not be called!')")
    self.t_env.add_python_file(failing_file)

    # Second file: same module name, working implementation.
    working_dir = os.path.join(self.tempdir, "python_file_dir_" + str(uuid.uuid4()))
    os.mkdir(working_dir)
    working_file = os.path.join(working_dir, module_name)
    with open(working_file, 'w') as out:
        out.write("def add_two(a):\n return a + 2")
    self.t_env.add_python_file(working_file)

    def plus_two(i):
        from test_dependency_manage_lib import add_two
        return add_two(i)

    add_two_udf = udf(plus_two, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.t_env.create_temporary_system_function("add_two", add_two_udf)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projected = t.select(expr.call("add_two", t.a), t.a)
    projected.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[3, 1]", "+I[4, 2]", "+I[5, 3]"])
def test_set_requirements_without_cached_directory(self):
    """Set a requirements file and check where the UDF imports cloudpickle from.

    The UDF asserts that the imported ``cloudpickle`` module file lies under
    the directory named by ``_PYTHON_REQUIREMENTS_INSTALL_DIR``.
    """
    requirements_file = os.path.join(self.tempdir, str(uuid.uuid4()))
    with open(requirements_file, 'w') as out:
        out.write("cloudpickle==1.2.2")
    self.t_env.set_python_requirements(requirements_file)

    def check_requirements(i):
        import cloudpickle
        assert os.path.abspath(cloudpickle.__file__).startswith(
            os.environ['_PYTHON_REQUIREMENTS_INSTALL_DIR'])
        return i

    requirements_udf = udf(check_requirements, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.t_env.create_temporary_system_function("check_requirements", requirements_udf)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projected = t.select(expr.call('check_requirements', t.a), t.a)
    projected.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[1, 1]", "+I[2, 2]", "+I[3, 3]"])
def test_add_python_archive(self):
    """Add a zip archive under the name ``data`` and read a file from it in a UDF.

    The archive holds ``data.txt`` containing ``2``; the UDF adds that number
    to its input after opening ``data/data.txt``.
    """
    archive_dir_path = os.path.join(self.tempdir, "archive_" + str(uuid.uuid4()))
    os.mkdir(archive_dir_path)
    data_file = os.path.join(archive_dir_path, "data.txt")
    with open(data_file, 'w') as out:
        out.write("2")

    # The archive base name is the parent directory of the archive folder.
    archive_base = os.path.dirname(archive_dir_path)
    archive_file_path = shutil.make_archive(archive_base, 'zip', archive_dir_path)
    self.t_env.add_python_archive(archive_file_path, "data")

    def add_from_file(i):
        with open("data/data.txt", 'r') as f:
            return i + int(f.read())

    archive_udf = udf(add_from_file, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.t_env.create_temporary_system_function("add_from_file", archive_udf)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projected = t.select(expr.call('add_from_file', t.a), t.a)
    projected.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[3, 1]", "+I[4, 2]", "+I[5, 3]"])
def test_add_python_file(self):
    """Add a generated Python module and call its ``add_two`` from a UDF."""
    module_dir = os.path.join(self.tempdir, "python_file_dir_" + str(uuid.uuid4()))
    os.mkdir(module_dir)
    module_file = os.path.join(module_dir, "test_dependency_manage_lib.py")
    with open(module_file, 'w') as out:
        out.write("def add_two(a):\n return a + 2")
    self.t_env.add_python_file(module_file)

    def plus_two(i):
        from test_dependency_manage_lib import add_two
        return add_two(i)

    add_two_udf = udf(plus_two, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.t_env.create_temporary_system_function("add_two", add_two_udf)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projected = t.select(expr.call("add_two", t.a), t.a)
    exec_insert_table(projected, "Results")

    self.assert_equals(source_sink_utils.results(), ["3,1", "4,2", "5,3"])
def test_group_aggregate_with_aux_group(self):
    """Group twice, apply two aggregate functions and check the sink rows.

    The first ``group_by`` derives ``b = a + 1`` and ``c = a + 2``; the second
    groups by ``a`` and ``b`` and applies ``mean_udaf`` and the registered
    ``max_add`` function.
    """
    t = self.t_env.from_elements(
        [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)],
        DataTypes.ROW(
            [DataTypes.FIELD("a", DataTypes.TINYINT()),
             DataTypes.FIELD("b", DataTypes.SMALLINT()),
             DataTypes.FIELD("c", DataTypes.INT())]))

    sink_table_ddl = """ CREATE TABLE Results(a TINYINT, b INT, c FLOAT, d INT) WITH ('connector'='test-sink') """
    self.t_env.execute_sql(sink_table_ddl)

    # Set the option once; a second write of the same key through
    # get_configuration().set_string() would be redundant.
    self.t_env.get_config().set('python.metric.enabled', 'true')

    self.t_env.register_function("max_add", udaf(MaxAdd(),
                                                 result_type=DataTypes.INT(),
                                                 func_type="pandas"))
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)

    t.group_by(t.a) \
        .select(t.a, (t.a + 1).alias("b"), (t.a + 2).alias("c")) \
        .group_by(t.a, t.b) \
        .select(t.a, t.b, mean_udaf(t.b), call("max_add", t.b, t.c, 1)) \
        .execute_insert("Results") \
        .wait()

    actual = source_sink_utils.results()
    self.assert_equals(actual, ["+I[1, 2, 2.0, 6]", "+I[2, 3, 3.0, 8]", "+I[3, 4, 4.0, 10]"])
def test_group_aggregate_with_aux_group(self):
    """Group twice, apply two aggregate functions and check the sink rows.

    The first ``group_by`` derives ``b = a + 1`` and ``c = a + 2``; the second
    groups by ``a`` and ``b`` and applies ``mean_udaf`` and the registered
    ``max_add`` function.
    """
    row_type = DataTypes.ROW([
        DataTypes.FIELD("a", DataTypes.TINYINT()),
        DataTypes.FIELD("b", DataTypes.SMALLINT()),
        DataTypes.FIELD("c", DataTypes.INT()),
    ])
    rows = [(1, 2, 3), (3, 2, 3), (2, 1, 3), (1, 5, 4), (1, 8, 6), (2, 3, 4)]
    t = self.t_env.from_elements(rows, row_type)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b', 'c', 'd'],
        [DataTypes.TINYINT(), DataTypes.INT(), DataTypes.FLOAT(), DataTypes.INT()])
    self.t_env.register_table_sink("Results", sink)

    self.t_env.get_config().set('python.metric.enabled', 'true')

    max_add = udaf(MaxAdd(), result_type=DataTypes.INT(), func_type="pandas")
    self.t_env.register_function("max_add", max_add)
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)

    derived = t.group_by(t.a).select(t.a, (t.a + 1).alias("b"), (t.a + 2).alias("c"))
    aggregated = derived.group_by(t.a, t.b).select(
        t.a, t.b, mean_udaf(t.b), call("max_add", t.b, t.c, 1))
    aggregated.execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[1, 2, 2.0, 6]", "+I[2, 3, 3.0, 8]", "+I[3, 4, 4.0, 10]"])
def test_tumbling_group_window_over_time(self):
    """Aggregate a CSV source over one-hour tumbling windows and check the rows.

    Per key ``a`` and window, the query emits the window start and end, the
    built-in count of ``c`` and the result of the registered ``my_count``.
    """
    # create source file path
    csv_rows = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:30:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
    ]
    source_path = self.tempdir + '/test_tumbling_group_window_over_time.csv'
    with open(source_path, 'w') as out:
        out.writelines(row + '\n' for row in csv_rows)

    self.t_env.create_temporary_system_function(
        "my_count", CountDistinctAggregateFunction())

    source_table = (
        " create table source_table( a TINYINT, b SMALLINT, c INT, "
        "rowtime TIMESTAMP(3), "
        "WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE ) "
        "with( 'connector.type' = 'filesystem', 'format.type' = 'csv', "
        "'connector.path' = '%s', 'format.ignore-first-line' = 'false', "
        "'format.field-delimiter' = ',' ) "
    ) % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils

    sink_table_ddl = (
        " CREATE TABLE Results(a TINYINT, b TIMESTAMP(3), c TIMESTAMP(3), d BIGINT, "
        "e BIGINT) WITH ('connector'='test-sink') "
    )
    self.t_env.execute_sql(sink_table_ddl)

    hourly_window = Tumble.over(lit(1).hours).on(t.rowtime).alias("w")
    grouped = t.window(hourly_window).group_by(t.a, col("w"))
    grouped.select(
        t.a,
        col("w").start,
        col("w").end,
        t.c.count.alias("c"),
        call("my_count", t.c).alias("d")).execute_insert("Results").wait()

    expected = [
        "+I[2, 2018-03-11T03:00, 2018-03-11T04:00, 2, 1]",
        "+I[3, 2018-03-11T03:00, 2018-03-11T04:00, 1, 1]",
        "+I[1, 2018-03-11T03:00, 2018-03-11T04:00, 2, 2]",
        "+I[1, 2018-03-11T04:00, 2018-03-11T05:00, 1, 1]",
    ]
    self.assert_equals(source_sink_utils.results(), expected)
def test_set_environment(self):
    """Configure a symlinked Python executable and check the UDF environment.

    One UDF asserts that ``os.environ["python"]`` equals the symlink path.
    The other expects ``get_gateway()`` to raise an exception whose message
    starts with the "launching the PythonGatewayServer" text, and raises
    itself if no exception occurs.
    """
    python_exec_link_path = os.path.join(self.tempdir, "py_exec")
    os.symlink(sys.executable, python_exec_link_path)
    self.t_env.get_config().set_python_executable(python_exec_link_path)

    def check_python_exec(i):
        import os
        assert os.environ["python"] == python_exec_link_path
        return i

    exec_udf = udf(check_python_exec, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.t_env.create_temporary_system_function("check_python_exec", exec_udf)

    def check_pyflink_gateway_disabled(i):
        try:
            from pyflink.java_gateway import get_gateway
            get_gateway()
        except Exception as err:
            assert str(err).startswith(
                "It's launching the PythonGatewayServer during Python UDF"
                " execution which is unexpected.")
        else:
            raise Exception("The gateway server is not disabled!")
        return i

    self.t_env._remote_mode = True
    gateway_udf = udf(check_pyflink_gateway_disabled, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.t_env.create_temporary_system_function(
        "check_pyflink_gateway_disabled", gateway_udf)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    t = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    checked = t.select(
        expr.call('check_python_exec', t.a),
        expr.call('check_pyflink_gateway_disabled', t.a))
    checked.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[1, 1]", "+I[2, 2]", "+I[3, 3]"])
def test_session_group_window_over_time(self):
    """Aggregate a CSV source over 30-minute session windows and check the rows.

    Rows are grouped by ``a``, ``b`` and the session window; the query emits
    the window start and end and the result of the registered ``my_count``.
    """
    # create source file path
    csv_rows = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00'
    ]
    source_path = self.tempdir + '/test_session_group_window_over_time.csv'
    with open(source_path, 'w') as out:
        out.writelines(row + '\n' for row in csv_rows)

    self.t_env.register_function("my_count", CountAggregateFunction())

    source_table = (
        " create table source_table( a TINYINT, b SMALLINT, c SMALLINT, "
        "rowtime TIMESTAMP(3), "
        "WATERMARK FOR rowtime AS rowtime - INTERVAL '60' MINUTE ) "
        "with( 'connector.type' = 'filesystem', 'format.type' = 'csv', "
        "'connector.path' = '%s', 'format.ignore-first-line' = 'false', "
        "'format.field-delimiter' = ',' ) "
    ) % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils

    sink_types = [
        DataTypes.TINYINT(),
        DataTypes.TIMESTAMP(3),
        DataTypes.TIMESTAMP(3),
        DataTypes.BIGINT(),
    ]
    sink = source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd'], sink_types)
    self.t_env.register_table_sink("Results", sink)

    session_window = Session.with_gap(lit(30).minutes).on(t.rowtime).alias("w")
    grouped = t.window(session_window).group_by(t.a, t.b, col("w"))
    grouped.select(
        t.a,
        col("w").start,
        col("w").end,
        call("my_count", t.c).alias("c")).execute_insert("Results").wait()

    expected = [
        "+I[3, 2018-03-11 03:10:00.0, 2018-03-11 03:40:00.0, 1]",
        "+I[2, 2018-03-11 03:10:00.0, 2018-03-11 04:00:00.0, 2]",
        "+I[1, 2018-03-11 03:10:00.0, 2018-03-11 04:10:00.0, 2]",
        "+I[1, 2018-03-11 04:20:00.0, 2018-03-11 04:50:00.0, 1]",
    ]
    self.assert_equals(source_sink_utils.results(), expected)
def test_expressions(self):
    """Check the string form of expressions built by the expression helpers.

    Each assertion compares ``str(...)`` of one helper call against the
    expected textual representation.
    """
    expr1 = col('a')
    expr2 = col('b')
    expr3 = col('c')

    self.assertEqual('10', str(lit(10, DataTypes.INT(False))))
    self.assertEqual('rangeTo(1, 2)', str(range_(1, 2)))
    self.assertEqual('and(a, b, c)', str(and_(expr1, expr2, expr3)))
    self.assertEqual('or(a, b, c)', str(or_(expr1, expr2, expr3)))

    from pyflink.table.expressions import UNBOUNDED_ROW, UNBOUNDED_RANGE, CURRENT_ROW, \
        CURRENT_RANGE
    self.assertEqual('unboundedRow()', str(UNBOUNDED_ROW))
    self.assertEqual('unboundedRange()', str(UNBOUNDED_RANGE))
    self.assertEqual('currentRow()', str(CURRENT_ROW))
    self.assertEqual('currentRange()', str(CURRENT_RANGE))

    self.assertEqual('currentDate()', str(current_date()))
    self.assertEqual('currentTime()', str(current_time()))
    self.assertEqual('currentTimestamp()', str(current_timestamp()))
    self.assertEqual('localTime()', str(local_time()))
    self.assertEqual('localTimestamp()', str(local_timestamp()))
    # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual('toTimestampLtz(123, 0)', str(to_timestamp_ltz(123, 0)))
    self.assertEqual("temporalOverlaps(cast('2:55:00', TIME(0)), 3600000, "
                     "cast('3:30:00', TIME(0)), 7200000)",
                     str(temporal_overlaps(
                         lit("2:55:00").to_time,
                         lit(1).hours,
                         lit("3:30:00").to_time,
                         lit(2).hours)))
    self.assertEqual("dateFormat(time, '%Y, %d %M')",
                     str(date_format(col("time"), "%Y, %d %M")))
    self.assertEqual("timestampDiff(DAY, cast('2016-06-15', DATE), cast('2016-06-18', DATE))",
                     str(timestamp_diff(
                         TimePointUnit.DAY,
                         lit("2016-06-15").to_date,
                         lit("2016-06-18").to_date)))
    self.assertEqual('array(1, 2, 3)', str(array(1, 2, 3)))
    self.assertEqual("row('key1', 1)", str(row("key1", 1)))
    self.assertEqual("map('key1', 1, 'key2', 2, 'key3', 3)",
                     str(map_("key1", 1, "key2", 2, "key3", 3)))
    self.assertEqual('4', str(row_interval(4)))
    self.assertEqual('pi()', str(pi()))
    self.assertEqual('e()', str(e()))
    self.assertEqual('rand(4)', str(rand(4)))
    self.assertEqual('randInteger(4)', str(rand_integer(4)))
    self.assertEqual('atan2(1, 2)', str(atan2(1, 2)))
    self.assertEqual('minusPrefix(a)', str(negative(expr1)))
    self.assertEqual('concat(a, b, c)', str(concat(expr1, expr2, expr3)))
    self.assertEqual("concat_ws(', ', b, c)", str(concat_ws(', ', expr2, expr3)))
    self.assertEqual('uuid()', str(uuid()))
    self.assertEqual('null', str(null_of(DataTypes.BIGINT())))
    self.assertEqual('log(a)', str(log(expr1)))
    self.assertEqual('ifThenElse(a, b, c)', str(if_then_else(expr1, expr2, expr3)))
    self.assertEqual('withColumns(a, b, c)', str(with_columns(expr1, expr2, expr3)))
    self.assertEqual('a.b.c(a)', str(call('a.b.c', expr1)))
def test_slide_group_window_aggregate_function(self):
    """Aggregate over one-hour windows sliding every 30 minutes and check rows.

    Per key ``a`` and window, the query emits the window start and end,
    ``mean_udaf(b)`` and the registered ``max_add`` applied to ``b``, ``c``
    and the literal ``1``.
    """
    import datetime
    from pyflink.table.window import Slide

    rows = [
        (1, 2, 3, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
        (3, 2, 4, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
        (2, 1, 2, datetime.datetime(2018, 3, 11, 3, 10, 0, 0)),
        (1, 3, 1, datetime.datetime(2018, 3, 11, 3, 40, 0, 0)),
        (1, 8, 5, datetime.datetime(2018, 3, 11, 4, 20, 0, 0)),
        (2, 3, 6, datetime.datetime(2018, 3, 11, 3, 30, 0, 0)),
    ]
    row_type = DataTypes.ROW([
        DataTypes.FIELD("a", DataTypes.TINYINT()),
        DataTypes.FIELD("b", DataTypes.SMALLINT()),
        DataTypes.FIELD("c", DataTypes.INT()),
        DataTypes.FIELD("rowtime", DataTypes.TIMESTAMP(3)),
    ])
    t = self.t_env.from_elements(rows, row_type)

    sink_types = [
        DataTypes.TINYINT(),
        DataTypes.TIMESTAMP(3),
        DataTypes.TIMESTAMP(3),
        DataTypes.FLOAT(),
        DataTypes.INT(),
    ]
    sink = source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd', 'e'], sink_types)
    self.t_env.register_table_sink("Results", sink)

    max_add = udaf(MaxAdd(), result_type=DataTypes.INT(), func_type="pandas")
    self.t_env.register_function("max_add", max_add)
    self.t_env.create_temporary_system_function("mean_udaf", mean_udaf)

    slide_window = Slide.over(lit(1).hours).every(lit(30).minutes).on(col("rowtime")).alias("w")
    grouped = t.window(slide_window).group_by(t.a, col("w"))
    grouped.select(
        t.a,
        col("w").start,
        col("w").end,
        mean_udaf(t.b),
        call("max_add", t.b, t.c, 1)).execute_insert("Results").wait()

    expected = [
        "+I[1, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0, 6]",
        "+I[1, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.5, 7]",
        "+I[1, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 5.5, 14]",
        "+I[1, 2018-03-11 04:00:00.0, 2018-03-11 05:00:00.0, 8.0, 14]",
        "+I[2, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 1.0, 4]",
        "+I[2, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0, 10]",
        "+I[2, 2018-03-11 03:30:00.0, 2018-03-11 04:30:00.0, 3.0, 10]",
        "+I[3, 2018-03-11 03:00:00.0, 2018-03-11 04:00:00.0, 2.0, 7]",
        "+I[3, 2018-03-11 02:30:00.0, 2018-03-11 03:30:00.0, 2.0, 7]",
    ]
    self.assert_equals(source_sink_utils.results(), expected)
def test_sql_ddl(self):
    """Create a Python function through SQL DDL and look for it in the plan."""
    create_function_ddl = ("create temporary function func1 as "
                           "'pyflink.table.tests.test_udf.add' language python")
    self.t_env.execute_sql(create_function_ddl)

    source = self.t_env.from_elements([(1, 2)]).alias("a", "b")
    table = source.select(expr.call("func1", expr.col("a"), expr.col("b")))

    plan = table.explain()
    # str.find returns -1 when the fragment is absent.
    position = plan.find("PythonCalc(select=[func1(f0, f1) AS _c0])")
    self.assertTrue(position >= 0)
def test_left_outer_join_lateral(self):
    """Check the query operation built by ``left_outer_join_lateral``.

    Asserts the join type, the correlated flag and the condition string of
    the underlying Java query operation.
    """
    env = self.t_env
    env.create_java_temporary_system_function(
        "split", "org.apache.flink.table.utils.TableFunc1")

    rows = [("1", "1#3#5#7"), ("2", "2#4#6#8")]
    source = env.from_elements(rows, ["id", "words"])

    split_call = expr.call('split', source.words).alias('word')
    joined = source.left_outer_join_lateral(split_call)

    operation = joined._j_table.getQueryOperation()
    self.assertEqual('LEFT_OUTER', operation.getJoinType().toString())
    self.assertTrue(operation.isCorrelated())
    self.assertEqual('true', operation.getCondition().toString())
def test_sliding_group_window_over_count(self):
    """Aggregate a CSV source over count-based sliding windows and check rows.

    The window covers 2 rows and slides by 1 row on the processing-time
    column; default parallelism is set to 1 before the job runs.
    """
    self.t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    # create source file path
    csv_rows = [
        '1,1,2,2018-03-11 03:10:00',
        '3,3,2,2018-03-11 03:10:00',
        '2,2,1,2018-03-11 03:10:00',
        '1,1,3,2018-03-11 03:40:00',
        '1,1,8,2018-03-11 04:20:00',
        '2,2,3,2018-03-11 03:30:00',
        '3,3,3,2018-03-11 03:30:00'
    ]
    source_path = self.tempdir + '/test_sliding_group_window_over_count.csv'
    with open(source_path, 'w') as out:
        out.writelines(row + '\n' for row in csv_rows)

    self.t_env.register_function("my_sum", SumAggregateFunction())

    source_table = (
        " create table source_table( a TINYINT, b SMALLINT, c SMALLINT, "
        "protime as PROCTIME() ) "
        "with( 'connector.type' = 'filesystem', 'format.type' = 'csv', "
        "'connector.path' = '%s', 'format.ignore-first-line' = 'false', "
        "'format.field-delimiter' = ',' ) "
    ) % source_path
    self.t_env.execute_sql(source_table)
    t = self.t_env.from_path("source_table")

    from pyflink.testing import source_sink_utils

    sink = source_sink_utils.TestAppendSink(
        ['a', 'd'], [DataTypes.TINYINT(), DataTypes.BIGINT()])
    self.t_env.register_table_sink("Results", sink)

    count_window = Slide.over(row_interval(2)).every(row_interval(1)).on(t.protime).alias("w")
    grouped = t.window(count_window).group_by(t.a, col("w"))
    grouped.select(t.a, call("my_sum", t.c).alias("b")).execute_insert("Results").wait()

    self.assert_equals(
        source_sink_utils.results(),
        ["+I[1, 5]", "+I[1, 11]", "+I[2, 4]", "+I[3, 5]"])
def test_set_requirements_with_cached_directory(self):
    """Set requirements with a local package directory and use the package.

    A requirements file naming ``python-package1==0.0.0`` and a directory
    holding the package tarball are passed to ``set_python_requirements``;
    the UDF then imports ``plus`` from ``python_package1``.
    """
    import base64

    tmp_dir = self.tempdir
    requirements_txt_path = os.path.join(tmp_dir, "requirements_txt_" + str(uuid.uuid4()))
    with open(requirements_txt_path, 'w') as out:
        out.write("python-package1==0.0.0")

    requirements_dir_path = os.path.join(tmp_dir, "requirements_dir_" + str(uuid.uuid4()))
    os.mkdir(requirements_dir_path)

    # This base64 data is encoded from a python package file which includes a
    # "python_package1" module. The module contains a "plus(a, b)" function.
    # The base64 can be recomputed by following code:
    # base64.b64encode(open("python-package1-0.0.0.tar.gz", "rb").read()).decode("utf-8")
    package_b64 = (
        "H4sICNefrV0C/2Rpc3QvcHl0aG9uLXBhY2thZ2UxLTAuMC4wLnRhcgDtmVtv2jAYhnPtX2H1CrRCY+ckI"
        "XEx7axuUA11u5imyICTRc1JiVnHfv1MKKWjYxwKEdPehws7xkmUfH5f+3PyqfqWpa1cjG5EKFnLbOvfhX"
        "FQTI3nOPPSdavS5Pa8nGMwy3Esi3ke9wyTObbnGNQxamBSKlFQavzUryG8ldG6frpbEGx4yNmDLMp/hPy"
        "P8b+6fNN613vdP1z8XdteG3+ug/17/F3Hcw1qIv5H54NUYiyUaH2SRRllaYeytkl6IpEdujI2yH2XapCQ"
        "wSRJRDHt0OveZa//uUfeZonUvUO5bHo+0ZcoVo9bMhFRvGx9H41kWj447aUsR0WUq+pui8arWKggK5Jli"
        "wGOo/95q79ovXi6/nfyf246Dof/n078fT9KI+X77Xx6BP83bX4Xf5NxT7dz7toO/L8OxjKgeTwpG+KcDp"
        "sdQjWFVJMipYI+o0MCk4X/t2UYtqI0yPabCHb3f861XcD/Ty/+Y5nLdCzT0dSPo/SmbKsf6un+b7KV+Ls"
        "W4/D/OoC9w/930P9eGwM75//csrD+Q/6P/P/k9D/oX3988Wqw1bS/tf6tR+s/m3EG/ddBqXO9XKf15C8p"
        "P9k4HZBtBgzZaVW5vrfKcj+W32W82ygEB9D/Xu9+4/qfP9L/rBv0X1v87yONKRX61/qfzwqjIDzIPTbv/"
        "7or3/88i0H/tfBFW7s/s/avRInQH06ieEy7tDrQeYHUdRN7wP+n/vf62LOH/pld7f9xz7a5Pfufedy0oP"
        "86iJI8KxStAq6yLC4JWdbbVbWRikR2z1ZGytk5vauW3QdnBFE6XqwmykazCesAAAAAAAAAAAAAAAAAAAA"
        "AAAAAAAAAAAAAAOBw/AJw5CHBAFAAAA==")
    package_path = os.path.join(requirements_dir_path, "python-package1-0.0.0.tar.gz")
    with open(package_path, 'wb') as out:
        out.write(base64.b64decode(package_b64))

    self.st_env.set_python_requirements(requirements_txt_path, requirements_dir_path)

    def add_one(i):
        from python_package1 import plus
        return plus(i, 1)

    add_one_udf = udf(add_one, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.st_env.create_temporary_system_function("add_one", add_one_udf)

    sink = source_sink_utils.TestAppendSink(
        ['a', 'b'], [DataTypes.BIGINT(), DataTypes.BIGINT()])
    self.st_env.register_table_sink("Results", sink)

    t = self.st_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projected = t.select(expr.call('add_one', t.a), t.a)
    projected.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[2, 1]", "+I[3, 2]", "+I[4, 3]"])
def test_join_lateral_with_join_predicate(self):
    """Check the query operation built by ``join_lateral`` with a predicate.

    Asserts the join type, the correlated flag and the condition string of
    the underlying Java query operation.
    """
    env = self.t_env
    env.create_java_temporary_system_function(
        "split", "org.apache.flink.table.legacyutils.TableFunc1")

    rows = [("1", "1#3#5#7"), ("2", "2#4#6#8")]
    source = env.from_elements(rows, ["id", "words"])

    split_call = expr.call('split', source.words).alias('word')
    predicate = expr.col('id') == expr.col('word')
    joined = source.join_lateral(split_call, predicate)

    operation = joined._j_table.getQueryOperation()
    self.assertEqual('INNER', operation.getJoinType().toString())
    self.assertTrue(operation.isCorrelated())
    self.assertEqual('equals(id, word)', operation.getCondition().toString())
def test_clean_state(self):
    """Run ``my_count`` over a datagen source with a 2 ms state TTL.

    The test has no assertion on the result; it only executes the query via
    ``to_pandas()``.
    """
    self.t_env.register_function("my_count", CountAggregateFunction())

    config = self.t_env.get_config()
    config.set("parallelism.default", "1")
    config.set("python.fn-execution.bundle.size", "1")
    config.set("python.state.cache-size", "0")
    config.set("table.exec.state.ttl", "2ms")

    source_ddl = (
        " CREATE TABLE test_source ( a BIGINT ) WITH ( 'connector' = 'datagen', "
        "'number-of-rows' = '5', 'rows-per-second' = '1' ) "
    )
    self.t_env.execute_sql(source_ddl)

    t = self.t_env.from_path('test_source')
    counted = t.select(call("my_count", t.a).alias("a"))
    counted.to_pandas()
def test_add_python_file(self):
    """Add a generated Python module and call its ``add_two`` from a UDF.

    The rows are gathered with ``self.collect`` and compared as strings.
    """
    module_dir = os.path.join(self.tempdir, "python_file_dir_" + str(uuid.uuid4()))
    os.mkdir(module_dir)
    module_file = os.path.join(module_dir, "test_dependency_manage_lib.py")
    with open(module_file, 'w') as out:
        out.write("def add_two(a):\n return a + 2")
    self.t_env.add_python_file(module_file)

    def plus_two(i):
        from test_dependency_manage_lib import add_two
        return add_two(i)

    add_two_udf = udf(plus_two, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.t_env.create_temporary_system_function("add_two", add_two_udf)

    source = self.t_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projected = source.select(expr.call('add_two', source.a), source.a)

    self.assertEqual(self.collect(projected), ["3,1", "4,2", "5,3"])
def test_set_requirements_without_cached_directory(self):
    """Set a requirements file and check the UDF environment.

    The UDF imports ``cloudpickle`` and asserts that
    ``_PYTHON_REQUIREMENTS_INSTALL_DIR`` is present in ``os.environ``.
    """
    requirements_file = os.path.join(self.tempdir, str(uuid.uuid4()))
    with open(requirements_file, 'w') as out:
        out.write("cloudpickle==2.1.0")
    self.st_env.set_python_requirements(requirements_file)

    def check_requirements(i):
        import cloudpickle  # noqa # pylint: disable=unused-import
        assert '_PYTHON_REQUIREMENTS_INSTALL_DIR' in os.environ
        return i

    requirements_udf = udf(check_requirements, DataTypes.BIGINT(), DataTypes.BIGINT())
    self.st_env.create_temporary_system_function("check_requirements", requirements_udf)

    self.st_env.execute_sql(
        " CREATE TABLE Results(a BIGINT, b BIGINT) WITH ('connector'='test-sink') ")

    t = self.st_env.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    projected = t.select(expr.call('check_requirements', t.a), t.a)
    projected.execute_insert("Results").wait()

    self.assert_equals(source_sink_utils.results(), ["+I[1, 1]", "+I[2, 2]", "+I[3, 3]"])
def test_basic_functionality(self):
    """Combine a pandas UDF and general Python UDFs in one query.

    Rows are filtered with ``add_one(b) <= 3`` and then projected with a
    string expression that calls ``add``, ``subtract_one`` and ``add_one``.
    """
    # pandas UDF
    add_one_udf = udf(lambda i: i + 1, result_type=DataTypes.BIGINT(), udf_type="pandas")
    self.t_env.create_temporary_system_function("add_one", add_one_udf)

    self.t_env.create_temporary_system_function("add", add)

    # general Python UDF
    subtract_one_udf = udf(SubtractOne(), DataTypes.BIGINT(), DataTypes.BIGINT())
    self.t_env.create_temporary_system_function("subtract_one", subtract_one_udf)

    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT(), DataTypes.BIGINT(),
                  DataTypes.BIGINT()]
    sink = source_sink_utils.TestAppendSink(['a', 'b', 'c', 'd'], sink_types)
    self.t_env.register_table_sink("Results", sink)

    t = self.t_env.from_elements([(1, 2, 3), (2, 5, 6), (3, 1, 9)], ['a', 'b', 'c'])
    filtered = t.where(E.call('add_one', t.b) <= 3)
    projected = filtered.select(
        "a, b + 1, add(a + 1, subtract_one(c)) + 2, add(add_one(a), 1L)")
    exec_insert_table(projected, "Results")

    self.assert_equals(source_sink_utils.results(), ["1,3,6,3", "3,2,14,5"])
def test_double_aggregate(self):
    """Aggregate per group, then aggregate those results, and compare frames.

    The first stage groups by ``c`` and applies ``my_count`` and ``my_sum``;
    the second stage applies ``my_count``, ``my_sum`` and ``sum0`` to the
    first stage's output without grouping.
    """
    self.t_env.register_function("my_count", CountAggregateFunction())
    self.t_env.create_temporary_function("my_sum", SumAggregateFunction())

    config = self.t_env.get_config()
    # trigger the finish bundle more frequently to ensure testing the communication
    # between RemoteKeyedStateBackend and the StateGrpcService.
    config.set("python.fn-execution.bundle.size", "2")
    # trigger the cache eviction in a bundle.
    config.set("python.state.cache-size", "1")

    rows = [(1, 'Hi', 'Hello'),
            (3, 'Hi', 'hi'),
            (3, 'Hi2', 'hi'),
            (3, 'Hi', 'hi2'),
            (2, 'Hi', 'Hello')]
    t = self.t_env.from_elements(rows, ['a', 'b', 'c'])

    per_group = t.group_by(t.c).select(
        call("my_count", t.a).alias("a"),
        call("my_sum", t.a).alias("b"),
        t.c)
    result = per_group.select(
        call("my_count", col("a")).alias("a"),
        call("my_sum", col("b")).alias("b"),
        call("sum0", col("b")).alias("c"),
        call("sum0", col("b").cast(DataTypes.DOUBLE())).alias("d"))

    expected = pd.DataFrame([[3, 12, 12, 12.0]], columns=['a', 'b', 'c', 'd'])
    assert_frame_equal(result.to_pandas(), expected)
def __call__(self, *args) -> Expression:
    """Return ``expressions.call(self, *args)``, so the object is callable.

    The import is kept inside the method, as in the original definition.
    """
    from pyflink.table.expressions import call as _call_expression
    return _call_expression(self, *args)
def test_all_data_types(self):
    """
    Send one row, holding a value for each column type in the row schema below,
    through a set of Python UDFs and compare the textual output of the ``Results`` sink.

    Every UDF asserts the type and/or value of the argument it receives and then
    returns it (``array_func`` returns the first element of its argument instead).
    """

    def boolean_func(bool_param):
        assert isinstance(bool_param, bool), (
            'bool_param of wrong type %s !' % type(bool_param))
        return bool_param

    def tinyint_func(tinyint_param):
        assert isinstance(tinyint_param, int), (
            'tinyint_param of wrong type %s !' % type(tinyint_param))
        return tinyint_param

    def smallint_func(smallint_param):
        assert isinstance(smallint_param, int), (
            'smallint_param of wrong type %s !' % type(smallint_param))
        assert smallint_param == 32767, (
            'smallint_param of wrong value %s' % smallint_param)
        return smallint_param

    def int_func(int_param):
        assert isinstance(int_param, int), (
            'int_param of wrong type %s !' % type(int_param))
        assert int_param == -2147483648, (
            'int_param of wrong value %s' % int_param)
        return int_param

    def bigint_func(bigint_param):
        assert isinstance(bigint_param, int), (
            'bigint_param of wrong type %s !' % type(bigint_param))
        return bigint_param

    def bigint_func_none(bigint_param):
        assert bigint_param is None, (
            'bigint_param %s should be None!' % bigint_param)
        return bigint_param

    def float_func(float_param):
        assert isinstance(float_param, float) and float_equal(float_param, 1.23, 1e-6), (
            'float_param is wrong value %s !' % float_param)
        return float_param

    def double_func(double_param):
        assert isinstance(double_param, float) and float_equal(double_param, 1.98932, 1e-7), (
            'double_param is wrong value %s !' % double_param)
        return double_param

    def bytes_func(bytes_param):
        assert bytes_param == b'flink', (
            'bytes_param is wrong value %s !' % bytes_param)
        return bytes_param

    def str_func(str_param):
        assert str_param == 'pyflink', (
            'str_param is wrong value %s !' % str_param)
        return str_param

    def date_func(date_param):
        from datetime import date
        assert date_param == date(year=2014, month=9, day=13), (
            'date_param is wrong value %s !' % date_param)
        return date_param

    def time_func(time_param):
        from datetime import time
        assert time_param == time(hour=12, minute=0, second=0, microsecond=123000), (
            'time_param is wrong value %s !' % time_param)
        return time_param

    def timestamp_func(timestamp_param):
        from datetime import datetime
        assert timestamp_param == datetime(2018, 3, 11, 3, 0, 0, 123000), (
            'timestamp_param is wrong value %s !' % timestamp_param)
        return timestamp_param

    def array_func(array_param):
        assert array_param == [[1, 2, 3]], (
            'array_param is wrong value %s !' % array_param)
        # the input is a nested array; only the inner list is returned
        return array_param[0]

    def map_func(map_param):
        assert map_param == {1: 'flink', 2: 'pyflink'}, (
            'map_param is wrong value %s !' % map_param)
        return map_param

    def decimal_func(decimal_param):
        from decimal import Decimal
        assert decimal_param == Decimal('1000000000000000000.050000000000000000'), (
            'decimal_param is wrong value %s !' % decimal_param)
        return decimal_param

    def decimal_cut_func(decimal_param):
        from decimal import Decimal
        # the source literal carries more fractional digits than this expected value
        assert decimal_param == Decimal('1000000000000000000.059999999999999999'), (
            'decimal_param is wrong value %s !' % decimal_param)
        return decimal_param

    env = self.t_env

    # (registered name, python function, declared result type), in registration order
    system_functions = [
        ("boolean_func", boolean_func, DataTypes.BOOLEAN()),
        ("tinyint_func", tinyint_func, DataTypes.TINYINT()),
        ("smallint_func", smallint_func, DataTypes.SMALLINT()),
        ("int_func", int_func, DataTypes.INT()),
        ("bigint_func", bigint_func, DataTypes.BIGINT()),
        ("bigint_func_none", bigint_func_none, DataTypes.BIGINT()),
        ("float_func", float_func, DataTypes.FLOAT()),
        ("double_func", double_func, DataTypes.DOUBLE()),
        ("bytes_func", bytes_func, DataTypes.BYTES()),
        ("str_func", str_func, DataTypes.STRING()),
        ("date_func", date_func, DataTypes.DATE()),
        ("time_func", time_func, DataTypes.TIME()),
        ("timestamp_func", timestamp_func, DataTypes.TIMESTAMP(3)),
        ("array_func", array_func, DataTypes.ARRAY(DataTypes.BIGINT())),
        ("map_func", map_func, DataTypes.MAP(DataTypes.BIGINT(), DataTypes.STRING())),
    ]
    for func_name, python_func, declared_type in system_functions:
        env.create_temporary_system_function(
            func_name, udf(python_func, result_type=declared_type))

    # the two decimal UDFs are registered through register_function instead
    for func_name, python_func in (("decimal_func", decimal_func),
                                   ("decimal_cut_func", decimal_cut_func)):
        env.register_function(
            func_name, udf(python_func, result_type=DataTypes.DECIMAL(38, 18)))

    # sink columns; 'n' is a flat array here because array_func returns the inner list
    sink_schema = [
        ('a', DataTypes.BIGINT()),
        ('b', DataTypes.BIGINT()),
        ('c', DataTypes.TINYINT()),
        ('d', DataTypes.BOOLEAN()),
        ('e', DataTypes.SMALLINT()),
        ('f', DataTypes.INT()),
        ('g', DataTypes.FLOAT()),
        ('h', DataTypes.DOUBLE()),
        ('i', DataTypes.BYTES()),
        ('j', DataTypes.STRING()),
        ('k', DataTypes.DATE()),
        ('l', DataTypes.TIME()),
        ('m', DataTypes.TIMESTAMP(3)),
        ('n', DataTypes.ARRAY(DataTypes.BIGINT())),
        ('o', DataTypes.MAP(DataTypes.BIGINT(), DataTypes.STRING())),
        ('p', DataTypes.DECIMAL(38, 18)),
        ('q', DataTypes.DECIMAL(38, 18)),
    ]
    table_sink = source_sink_utils.TestAppendSink(
        [column for column, _ in sink_schema],
        [column_type for _, column_type in sink_schema])
    env.register_table_sink("Results", table_sink)

    import datetime
    import decimal

    input_row = (
        1, None, 1, True, 32767, -2147483648, 1.23, 1.98932,
        bytearray(b'flink'), 'pyflink',
        datetime.date(2014, 9, 13),
        datetime.time(hour=12, minute=0, second=0, microsecond=123000),
        datetime.datetime(2018, 3, 11, 3, 0, 0, 123000),
        [[1, 2, 3]],
        {1: 'flink', 2: 'pyflink'},
        decimal.Decimal('1000000000000000000.05'),
        decimal.Decimal('1000000000000000000.05999999999999999899999999999'),
    )

    # source columns; 'n' is an array of arrays on the input side
    source_schema = [
        ("a", DataTypes.BIGINT()),
        ("b", DataTypes.BIGINT()),
        ("c", DataTypes.TINYINT()),
        ("d", DataTypes.BOOLEAN()),
        ("e", DataTypes.SMALLINT()),
        ("f", DataTypes.INT()),
        ("g", DataTypes.FLOAT()),
        ("h", DataTypes.DOUBLE()),
        ("i", DataTypes.BYTES()),
        ("j", DataTypes.STRING()),
        ("k", DataTypes.DATE()),
        ("l", DataTypes.TIME()),
        ("m", DataTypes.TIMESTAMP(3)),
        ("n", DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.BIGINT()))),
        ("o", DataTypes.MAP(DataTypes.BIGINT(), DataTypes.STRING())),
        ("p", DataTypes.DECIMAL(38, 18)),
        ("q", DataTypes.DECIMAL(38, 18)),
    ]
    row_type = DataTypes.ROW(
        [DataTypes.FIELD(column, column_type) for column, column_type in source_schema])
    source = env.from_elements([input_row], row_type)

    # one UDF call per input column, in sink column order
    projections = [
        call("bigint_func", source.a),
        call("bigint_func_none", source.b),
        call("tinyint_func", source.c),
        call("boolean_func", source.d),
        call("smallint_func", source.e),
        call("int_func", source.f),
        call("float_func", source.g),
        call("double_func", source.h),
        call("bytes_func", source.i),
        call("str_func", source.j),
        call("date_func", source.k),
        call("time_func", source.l),
        call("timestamp_func", source.m),
        call("array_func", source.n),
        call("map_func", source.o),
        call("decimal_func", source.p),
        call("decimal_cut_func", source.q),
    ]
    source.select(*projections).execute_insert("Results").wait()

    actual = source_sink_utils.results()
    # Currently the sink result precision of DataTypes.TIME(precision) only supports 0.
    expected_row = (
        "+I[1, null, 1, true, 32767, -2147483648, 1.23, 1.98932, "
        "[102, 108, 105, 110, 107], pyflink, 2014-09-13, "
        "12:00:00, 2018-03-11 03:00:00.123, [1, 2, 3], "
        "{1=flink, 2=pyflink}, 1000000000000000000.050000000000000000, "
        "1000000000000000000.059999999999999999]"
    )
    self.assert_equals(actual, [expected_row])