def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())

    # used to test pipeline.jars and pipeline.classpaths
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().set(config_key, config_value)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    t_env.execute_sql(
        "create temporary system function add_one as 'add_one.add_one' language python")

    t_env.register_java_function("add_one_java", "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t = t_env.from_elements(elements, ["word", "count"])
    t.select(t.word,
             call("add_one", t.count).alias("count"),
             call("add_one_java", t.count).alias("count_java")) \
        .group_by(t.word) \
        .select(t.word,
                col("count").count.alias("count"),
                col("count_java").count.alias("count_java")) \
        .execute_insert("Results")

def process_json_data():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .build())

    table = table.select(col('id'), col('data').json_value('$.addr.country', DataTypes.STRING()))

    # execute
    table.execute_insert('sink') \
         .wait()

def sql_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    t_env.sql_query("SELECT * FROM %s" % table) \
         .execute().print()
    # +----+----------------------+--------------------------------+
    # | op | id | data |
    # +----+----------------------+--------------------------------+
    # | +I | 1 | {"name": "Flink", "tel": 12... |
    # | +I | 2 | {"name": "hello", "tel": 13... |
    # | +I | 3 | {"name": "world", "tel": 12... |
    # | +I | 4 | {"name": "PyFlink", "tel": ... |
    # +----+----------------------+--------------------------------+

    # execute sql statement
    @udtf(result_types=[DataTypes.STRING(), DataTypes.INT(), DataTypes.STRING()])
    def parse_data(data: str):
        json_data = json.loads(data)
        yield json_data['name'], json_data['tel'], json_data['addr']['country']

    t_env.create_temporary_function('parse_data', parse_data)
    t_env.execute_sql("""
        SELECT *
        FROM %s, LATERAL TABLE(parse_data(`data`)) t(name, tel, country)
        """ % table).print()
    # +----+----------------------+--------------------------------+--------------------------------+-------------+--------------------------------+
    # | op | id | data | name | tel | country |
    # +----+----------------------+--------------------------------+--------------------------------+-------------+--------------------------------+
    # | +I | 1 | {"name": "Flink", "tel": 12... | Flink | 123 | Germany |
    # | +I | 2 | {"name": "hello", "tel": 13... | hello | 135 | China |
    # | +I | 3 | {"name": "world", "tel": 12... | world | 124 | USA |
    # | +I | 4 | {"name": "PyFlink", "tel": ... | PyFlink | 32 | China |
    # +----+----------------------+--------------------------------+--------------------------------+-------------+--------------------------------+

    # explain sql plan
    print(t_env.explain_sql("""
        SELECT *
        FROM %s, LATERAL TABLE(parse_data(`data`)) t(name, tel, country)
        """ % table))

def setUp(self):
    super(StreamDependencyTests, self).setUp()
    # read the current value (may be unset) so it can be restored afterwards
    origin_execution_mode = os.environ.get('_python_worker_execution_mode')
    os.environ['_python_worker_execution_mode'] = "loopback"
    try:
        self.st_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    finally:
        if origin_execution_mode is not None:
            os.environ['_python_worker_execution_mode'] = origin_execution_mode

def conversion_from_dataframe():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    t_env.get_config().set("parallelism.default", "1")

    # define the source from a pandas DataFrame
    pdf = pd.DataFrame(np.random.rand(1000, 2))
    table = t_env.from_pandas(pdf,
                              schema=DataTypes.ROW([DataTypes.FIELD("a", DataTypes.DOUBLE()),
                                                    DataTypes.FIELD("b", DataTypes.DOUBLE())]))
    print(table.to_pandas())

def word_count(input_path, output_path):
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    # write all the data to one file
    t_env.get_config().get_configuration().set_string("parallelism.default", "1")

    # define the source
    if input_path is not None:
        t_env.create_temporary_table(
            'source',
            TableDescriptor.for_connector('filesystem')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .build())
                           .option('path', input_path)
                           .format('csv')
                           .build())
        tab = t_env.from_path('source')
    else:
        print("Executing word_count example with default input data set.")
        print("Use --input to specify file input.")
        tab = t_env.from_elements(map(lambda i: (i, ), word_count_data),
                                  DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())]))

    # define the sink
    if output_path is not None:
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('filesystem')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .column('count', DataTypes.BIGINT())
                                   .build())
                           .option('path', output_path)
                           .format(FormatDescriptor.for_format('canal-json').build())
                           .build())
    else:
        print("Printing result to stdout. Use --output to specify output path.")
        t_env.create_temporary_table(
            'sink',
            TableDescriptor.for_connector('print')
                           .schema(Schema.new_builder()
                                   .column('word', DataTypes.STRING())
                                   .column('count', DataTypes.BIGINT())
                                   .build())
                           .build())

    @udtf(result_types=[DataTypes.STRING()])
    def split(line: Row):
        for s in line[0].split():
            yield Row(s)

    # compute word count
    tab.flat_map(split).alias('word') \
       .group_by(col('word')) \
       .select(col('word'), lit(1).count) \
       .execute_insert('sink') \
       .wait()

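# A possible command-line entry point for word_count(input_path, output_path) above.
# This is only a sketch: the --input/--output flag names are assumed from the usage
# hints the function prints, not taken from the original file.
if __name__ == '__main__':
    import argparse
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=False,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=False,
                        help='Output file to write results to.')
    known_args, _ = parser.parse_known_args(sys.argv[1:])

    word_count(known_args.input, known_args.output)
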
def word_count():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"
    with open(source_path, 'w') as f:
        for word in content.split(" "):
            f.write(",".join([word, "1"]))
            f.write("\n")
        f.flush()

    t_config = TableConfig.Builder().as_batch_execution().set_parallelism(1).build()
    t_env = TableEnvironment.create(t_config)

    field_names = ["word", "count"]
    field_types = [DataTypes.STRING, DataTypes.LONG]

    # register Word table in table environment
    t_env.register_table_source(
        "Word",
        CsvTableSource(source_path, field_names, field_types))

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink(
        "Results",
        field_names,
        field_types,
        CsvTableSink(tmp_csv))

    t_env.scan("Word") \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute()

def multi_sink():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    table = t_env.from_elements(
        elements=[(1, 'Hello'), (2, 'World'), (3, "Flink"), (4, "PyFlink")],
        schema=['id', 'data'])

    # define the sink tables
    t_env.execute_sql("""
        CREATE TABLE first_sink (
            id BIGINT,
            data VARCHAR
        ) WITH (
            'connector' = 'print'
        )
    """)
    t_env.execute_sql("""
        CREATE TABLE second_sink (
            id BIGINT,
            data VARCHAR
        ) WITH (
            'connector' = 'print'
        )
    """)

    # create a statement set
    statement_set = t_env.create_statement_set()

    # emit the data with id <= 3 to the "first_sink" via sql statement
    statement_set.add_insert_sql(
        "INSERT INTO first_sink SELECT * FROM %s WHERE id <= 3" % table)

    # emit the data which contains "Flink" to the "second_sink"
    @udf(result_type=DataTypes.BOOLEAN())
    def contains_flink(data):
        return "Flink" in data

    second_table = table.where(contains_flink(table.data))
    statement_set.add_insert("second_sink", second_table)

    # execute the statement set
    # remove .wait if submitting to a remote cluster, refer to
    # https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/python/faq/#wait-for-jobs-to-finish-when-executing-jobs-in-mini-cluster
    # for more details
    statement_set.execute().wait()

def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
    t_env = TableEnvironment.create(environment_settings=env_settings)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
        .select(table.word, expr.lit(1).count.alias('count')) \
        .execute_insert("Results")

def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig.Builder().as_batch_execution().build()
    t_env = TableEnvironment.create(t_config)

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute()

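# The connect()/OldCsv() descriptor API used above was deprecated and later removed.
# A rough TableDescriptor-based equivalent of the same CSV sink registration, written
# as a sketch that reuses the filesystem/csv options seen elsewhere in this section
# (it assumes the t_env and result_path variables from the function above):
t_env.create_temporary_table(
    'Results',
    TableDescriptor.for_connector('filesystem')
                   .schema(Schema.new_builder()
                           .column('word', DataTypes.STRING())
                           .column('count', DataTypes.BIGINT())
                           .build())
                   .option('path', result_path)
                   .format('csv')
                   .build())
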
def test_end_to_end():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    with open(source_path, 'w') as f:
        lines = '1,hi,hello\n' + '2,hi,hello\n'
        f.write(lines)
        f.close()

    _find_flink_home()
    print("using %s as FLINK_HOME..." % os.environ["FLINK_HOME"])

    t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
    t_env = TableEnvironment.get_table_environment(t_config)

    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

    # register Orders table in table environment
    t_env.register_table_source(
        "Orders",
        CsvTableSource(source_path, field_names, field_types))

    # register Results table in table environment
    tmp_dir = tempfile.gettempdir()
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink(
        "Results",
        field_names,
        field_types,
        CsvTableSink(tmp_csv))

    t_env.scan("Orders") \
        .where("a > 0") \
        .select("a + 1, b, c") \
        .insert_into("Results")

    t_env.execute()

    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '2,hi,hello\n' + '3,hi,hello\n'
    print("test passed, the log file is under this directory: %s/log" % os.environ["FLINK_HOME"])

def pandas_udf_demo():
    # Init environment
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().in_batch_mode().build()
    t_env = TableEnvironment.create(environment_settings=environment_settings)
    t_env.get_config().get_configuration().set_string('parallelism.default', '1')

    # Define and register UDF
    add = udf(lambda i, j: i + j,
              [DataTypes.BIGINT(), DataTypes.BIGINT()],
              DataTypes.BIGINT(),
              udf_type="pandas")

    # Register Source
    t_env.execute_sql("""
        CREATE TABLE mySource (
            a BIGINT,
            b BIGINT
        ) WITH (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '/opt/examples/table/input/udf_add_input'
        )
    """)

    # Register Sink
    t_env.execute_sql("""
        CREATE TABLE mySink (
            `sum` BIGINT
        ) WITH (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = '/opt/examples/table/output/pandas_udf_add_output'
        )
    """)

    # Query
    t_env.from_path('mySource') \
        .select(add(col('a'), col('b'))) \
        .insert_into('mySink')

    t_env.execute("5-pandas_udf_add")

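# On newer PyFlink versions the insert_into(...) + t_env.execute(...) pair at the end
# of the demo above is typically replaced by execute_insert() on the Table itself, as
# the streaming snippets in this section already do. A sketch of the equivalent final
# step (assuming the same t_env, add and mySink as above):
t_env.from_path('mySource') \
    .select(add(col('a'), col('b'))) \
    .execute_insert('mySink') \
    .wait()
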
def log_processing():
    env_settings = EnvironmentSettings.new_instance().use_blink_planner().in_streaming_mode().build()
    t_env = TableEnvironment.create(env_settings)
    # specify connector and format jars
    t_env.get_config().get_configuration().set_string(
        "pipeline.jars",
        "file:///my/jar/path/connector.jar;file:///my/jar/path/json.jar")

    source_ddl = """
        CREATE TABLE source_table(
            a VARCHAR,
            b BIGINT
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'topic',
            'properties.bootstrap.servers' = 'localhost:9092',
            'properties.group.id' = 'testGroup',
            'scan.startup.mode' = 'latest-offset',
            'format' = 'json'
        )
        """

    sink_ddl = """
        CREATE TABLE sink_table(
            a VARCHAR
        ) WITH (
            'connector' = 'kafka',
            'topic' = 'topic2',
            'properties.bootstrap.servers' = 'kafka:9092',
            'format' = 'json'
        )
        """

    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    t_env.sql_query("SELECT a FROM source_table") \
        .execute_insert("sink_table").wait()

def process_json_data_with_udf():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('id', DataTypes.BIGINT())
                               .column('data', DataTypes.STRING())
                               .build())
                       .build())

    # update json columns
    @udf(result_type=DataTypes.STRING())
    def update_tel(data):
        json_data = json.loads(data)
        json_data['tel'] += 1
        return json.dumps(json_data)

    table = table.select(table.id, update_tel(table.data))

    # execute
    table.execute_insert('sink') \
         .wait()

def setUp(self):
    super(PyFlinkStreamTableTestCase, self).setUp()
    self.t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
    self.t_env = TableEnvironment.create(self.t_config)

def setUp(self):
    super(StreamDependencyTests, self).setUp()
    self.st_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
    self.st_env._execution_mode = "loopback"

def setUp(self):
    super(PyFlinkBatchTableTestCase, self).setUp()
    self.t_config = TableConfig.Builder().as_batch_execution().set_parallelism(4).build()
    self.t_env = TableEnvironment.get_table_environment(self.t_config)

def column_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # add columns
    table = table.add_columns(
        col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
        col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
        col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country'))
    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op | id | data | name | tel | country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I | 1 | {"name": "Flink", "tel": 12... | Flink | 123 | Germany |
    # | +I | 2 | {"name": "hello", "tel": 13... | hello | 135 | China |
    # | +I | 3 | {"name": "world", "tel": 12... | world | 124 | USA |
    # | +I | 4 | {"name": "PyFlink", "tel": ... | PyFlink | 32 | China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+

    # drop columns
    table = table.drop_columns(col('data'))
    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op | id | name | tel | country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I | 1 | Flink | 123 | Germany |
    # | +I | 2 | hello | 135 | China |
    # | +I | 3 | world | 124 | USA |
    # | +I | 4 | PyFlink | 32 | China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # rename columns
    table = table.rename_columns(col('tel').alias('telephone'))
    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op | id | name | telephone | country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I | 1 | Flink | 123 | Germany |
    # | +I | 2 | hello | 135 | China |
    # | +I | 3 | world | 124 | USA |
    # | +I | 4 | PyFlink | 32 | China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # replace columns
    table = table.add_or_replace_columns(
        concat(col('id').cast(DataTypes.STRING()), '_', col('name')).alias('id'))
    table.execute().print()

def setUp(self):
    super(PyFlinkBatchTableTestCase, self).setUp()
    self.t_config = TableConfig.Builder().as_batch_execution().set_parallelism(1).build()
    self.t_env = TableEnvironment.create(self.t_config)

def basic_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    right_table = t_env.from_elements(
        elements=[(1, 18), (2, 30), (3, 25), (4, 10)],
        schema=['id', 'age'])

    table = table.add_columns(
        col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
        col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
        col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
        .drop_columns(col('data'))
    table.execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op | id | name | tel | country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I | 1 | Flink | 123 | Germany |
    # | +I | 2 | hello | 135 | China |
    # | +I | 3 | world | 124 | USA |
    # | +I | 4 | PyFlink | 32 | China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # limit the number of outputs
    table.limit(3).execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op | id | name | tel | country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I | 1 | Flink | 123 | Germany |
    # | +I | 2 | hello | 135 | China |
    # | +I | 3 | world | 124 | USA |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # filter
    table.filter(col('id') != 3).execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op | id | name | tel | country |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I | 1 | Flink | 123 | Germany |
    # | +I | 2 | hello | 135 | China |
    # | +I | 4 | PyFlink | 32 | China |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+

    # aggregation
    table.group_by(col('country')) \
        .select(col('country'), col('id').count, col('tel').cast(DataTypes.BIGINT()).max) \
        .execute().print()
    # +----+--------------------------------+----------------------+----------------------+
    # | op | country | EXPR$0 | EXPR$1 |
    # +----+--------------------------------+----------------------+----------------------+
    # | +I | Germany | 1 | 123 |
    # | +I | USA | 1 | 124 |
    # | +I | China | 1 | 135 |
    # | -U | China | 1 | 135 |
    # | +U | China | 2 | 135 |
    # +----+--------------------------------+----------------------+----------------------+

    # distinct
    table.select(col('country')).distinct() \
        .execute().print()
    # +----+--------------------------------+
    # | op | country |
    # +----+--------------------------------+
    # | +I | Germany |
    # | +I | China |
    # | +I | USA |
    # +----+--------------------------------+

    # join
    # Note that it still doesn't support duplicate column names between the joined tables
    table.join(right_table.rename_columns(col('id').alias('r_id')), col('id') == col('r_id')) \
        .execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+
    # | op | id | name | tel | country | r_id | age |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+
    # | +I | 4 | PyFlink | 32 | China | 4 | 10 |
    # | +I | 1 | Flink | 123 | Germany | 1 | 18 |
    # | +I | 2 | hello | 135 | China | 2 | 30 |
    # | +I | 3 | world | 124 | USA | 3 | 25 |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+

    # join lateral
    @udtf(result_types=[DataTypes.STRING()])
    def split(r: Row):
        for s in r.name.split("i"):
            yield s

    table.join_lateral(split.alias('a')) \
        .execute().print()
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | op | id | name | tel | country | a |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+
    # | +I | 1 | Flink | 123 | Germany | Fl |
    # | +I | 1 | Flink | 123 | Germany | nk |
    # | +I | 2 | hello | 135 | China | hello |
    # | +I | 3 | world | 124 | USA | world |
    # | +I | 4 | PyFlink | 32 | China | PyFl |
    # | +I | 4 | PyFlink | 32 | China | nk |
    # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+

    # show schema
    table.print_schema()
    # (
    #   `id` BIGINT,
    #   `name` STRING,
    #   `tel` STRING,
    #   `country` STRING
    # )

    # show execute plan
    print(table.join_lateral(split.alias('a')).explain())

def row_operations():
    t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())

    # define the source
    table = t_env.from_elements(
        elements=[
            (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}'),
            (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}'),
            (3, '{"name": "world", "tel": 124, "addr": {"country": "China", "city": "NewYork"}}'),
            (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}')
        ],
        schema=['id', 'data'])

    # map operation
    @udf(result_type=DataTypes.ROW([DataTypes.FIELD("id", DataTypes.BIGINT()),
                                    DataTypes.FIELD("country", DataTypes.STRING())]))
    def extract_country(input_row: Row):
        data = json.loads(input_row.data)
        return Row(input_row.id, data['addr']['country'])

    table.map(extract_country) \
         .execute().print()
    # +----+----------------------+--------------------------------+
    # | op | id | country |
    # +----+----------------------+--------------------------------+
    # | +I | 1 | Germany |
    # | +I | 2 | China |
    # | +I | 3 | China |
    # | +I | 4 | China |
    # +----+----------------------+--------------------------------+

    # flat_map operation
    @udtf(result_types=[DataTypes.BIGINT(), DataTypes.STRING()])
    def extract_city(input_row: Row):
        data = json.loads(input_row.data)
        yield input_row.id, data['addr']['city']

    table.flat_map(extract_city) \
         .execute().print()
    # +----+----------------------+--------------------------------+
    # | op | f0 | f1 |
    # +----+----------------------+--------------------------------+
    # | +I | 1 | Berlin |
    # | +I | 2 | Shanghai |
    # | +I | 3 | NewYork |
    # | +I | 4 | Hangzhou |
    # +----+----------------------+--------------------------------+

    # aggregate operation
    class CountAndSumAggregateFunction(AggregateFunction):

        def get_value(self, accumulator):
            return Row(accumulator[0], accumulator[1])

        def create_accumulator(self):
            return Row(0, 0)

        def accumulate(self, accumulator, input_row):
            accumulator[0] += 1
            accumulator[1] += int(input_row.tel)

        def retract(self, accumulator, input_row):
            accumulator[0] -= 1
            accumulator[1] -= int(input_row.tel)

        def merge(self, accumulator, accumulators):
            for other_acc in accumulators:
                accumulator[0] += other_acc[0]
                accumulator[1] += other_acc[1]

        def get_accumulator_type(self):
            return DataTypes.ROW([
                DataTypes.FIELD("cnt", DataTypes.BIGINT()),
                DataTypes.FIELD("sum", DataTypes.BIGINT())])

        def get_result_type(self):
            return DataTypes.ROW([
                DataTypes.FIELD("cnt", DataTypes.BIGINT()),
                DataTypes.FIELD("sum", DataTypes.BIGINT())])

    count_sum = udaf(CountAndSumAggregateFunction())
    table.add_columns(
        col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
        col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
        col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
        .group_by(col('country')) \
        .aggregate(count_sum.alias("cnt", "sum")) \
        .select(col('country'), col('cnt'), col('sum')) \
        .execute().print()
    # +----+--------------------------------+----------------------+----------------------+
    # | op | country | cnt | sum |
    # +----+--------------------------------+----------------------+----------------------+
    # | +I | China | 3 | 291 |
    # | +I | Germany | 1 | 123 |
    # +----+--------------------------------+----------------------+----------------------+

    # flat_aggregate operation
    class Top2(TableAggregateFunction):

        def emit_value(self, accumulator):
            for v in accumulator:
                if v:
                    yield Row(v)

        def create_accumulator(self):
            return [None, None]

        def accumulate(self, accumulator, input_row):
            tel = int(input_row.tel)
            if accumulator[0] is None or tel > accumulator[0]:
                accumulator[1] = accumulator[0]
                accumulator[0] = tel
            elif accumulator[1] is None or tel > accumulator[1]:
                accumulator[1] = tel

        def get_accumulator_type(self):
            return DataTypes.ARRAY(DataTypes.BIGINT())

        def get_result_type(self):
            return DataTypes.ROW([DataTypes.FIELD("tel", DataTypes.BIGINT())])

    top2 = udtaf(Top2())
    table.add_columns(
        col('data').json_value('$.name', DataTypes.STRING()).alias('name'),
        col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'),
        col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \
        .group_by(col('country')) \
        .flat_aggregate(top2) \
        .select(col('country'), col('tel')) \
        .execute().print()

from pyflink.table import DataTypes, EnvironmentSettings, TableEnvironment
from pyflink.table.expressions import col
from pyflink.table.udf import udf

env_settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
t_env = TableEnvironment.create(environment_settings=env_settings)
t_env.get_config().get_configuration().set_string('parallelism.default', '1')


@udf(input_types=[DataTypes.BIGINT(), DataTypes.BIGINT()], result_type=DataTypes.BIGINT())
def add(i, j):
    from mpmath import fadd  # add third-party dependency
    return int(fadd(1, 2))


t_env.set_python_requirements("/opt/examples/table/data/requirements.txt")

t_env.execute_sql("""
    CREATE TABLE mySource (
        a BIGINT,
        b BIGINT
    ) WITH (
        'connector' = 'filesystem',
        'format' = 'csv',
        'path' = '/opt/examples/table/input/udf_add_input'
    )
""")

t_env.execute_sql("""
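    -- NOTE: the original snippet is truncated at this point. The sink DDL below is
    -- an assumed continuation modeled on the pandas UDF demo earlier in this section;
    -- the table name and output path are hypothetical, not the original code.
    CREATE TABLE mySink (
        `sum` BIGINT
    ) WITH (
        'connector' = 'filesystem',
        'format' = 'csv',
        'path' = '/opt/examples/table/output/udf_add_output'
    )
""")

# Assumed final wiring (hypothetical, mirroring the pandas UDF demo above):
t_env.from_path('mySource') \
    .select(add(col('a'), col('b'))) \
    .execute_insert('mySink') \
    .wait()
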
def setUp(self):
    super(PyFlinkStreamTableTestCase, self).setUp()
    self.t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(4).build()
    self.t_env = TableEnvironment.get_table_environment(self.t_config)