def sql_operations(): t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) # define the source table = t_env.from_elements(elements=[ (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}' ), (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}' ), (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}' ), (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}' ) ], schema=['id', 'data']) t_env.sql_query("SELECT * FROM %s" % table) \ .execute().print() # +----+----------------------+--------------------------------+ # | op | id | data | # +----+----------------------+--------------------------------+ # | +I | 1 | {"name": "Flink", "tel": 12... | # | +I | 2 | {"name": "hello", "tel": 13... | # | +I | 3 | {"name": "world", "tel": 12... | # | +I | 4 | {"name": "PyFlink", "tel": ... | # +----+----------------------+--------------------------------+ # execute sql statement @udtf( result_types=[DataTypes.STRING(), DataTypes.INT(), DataTypes.STRING()]) def parse_data(data: str): json_data = json.loads(data) yield json_data['name'], json_data['tel'], json_data['addr']['country'] t_env.create_temporary_function('parse_data', parse_data) t_env.execute_sql(""" SELECT * FROM %s, LATERAL TABLE(parse_data(`data`)) t(name, tel, country) """ % table).print() # +----+----------------------+--------------------------------+--------------------------------+-------------+--------------------------------+ # | op | id | data | name | tel | country | # +----+----------------------+--------------------------------+--------------------------------+-------------+--------------------------------+ # | +I | 1 | {"name": "Flink", "tel": 12... | Flink | 123 | Germany | # | +I | 2 | {"name": "hello", "tel": 13... | hello | 135 | China | # | +I | 3 | {"name": "world", "tel": 12... | world | 124 | USA | # | +I | 4 | {"name": "PyFlink", "tel": ... | PyFlink | 32 | China | # +----+----------------------+--------------------------------+--------------------------------+-------------+--------------------------------+ # explain sql plan print( t_env.explain_sql(""" SELECT * FROM %s, LATERAL TABLE(parse_data(`data`)) t(name, tel, country) """ % table))
def process_json_data(): t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) # define the source table = t_env.from_elements(elements=[ (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}' ), (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}' ), (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}' ), (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}' ) ], schema=['id', 'data']) # define the sink t_env.create_temporary_table( 'sink', TableDescriptor.for_connector('print').schema( Schema.new_builder().column('id', DataTypes.BIGINT()).column( 'data', DataTypes.STRING()).build()).build()) table = table.select( col('id'), col('data').json_value('$.addr.country', DataTypes.STRING())) # execute table.execute_insert('sink') \ .wait()
def setUp(self): super(StreamDependencyTests, self).setUp() origin_execution_mode = os.environ['_python_worker_execution_mode'] os.environ['_python_worker_execution_mode'] = "loopback" try: self.st_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) finally: if origin_execution_mode is not None: os.environ['_python_worker_execution_mode'] = origin_execution_mode
def conversion_from_dataframe(): t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) t_env.get_config().set("parallelism.default", "1") # define the source with watermark definition pdf = pd.DataFrame(np.random.rand(1000, 2)) table = t_env.from_pandas(pdf, schema=DataTypes.ROW([ DataTypes.FIELD("a", DataTypes.DOUBLE()), DataTypes.FIELD("b", DataTypes.DOUBLE()) ])) print(table.to_pandas())
def test_add_python_file(self): import uuid env = self.env python_file_dir = os.path.join(self.tempdir, "python_file_dir_" + str(uuid.uuid4())) os.mkdir(python_file_dir) python_file_path = os.path.join(python_file_dir, "test_dep1.py") with open(python_file_path, 'w') as f: f.write("def add_two(a):\n return a + 2") def plus_two_map(value): from test_dep1 import add_two return add_two(value) get_j_env_configuration(env._j_stream_execution_environment).\ setString("taskmanager.numberOfTaskSlots", "10") env.add_python_file(python_file_path) ds = env.from_collection([1, 2, 3, 4, 5]) ds = ds.map(plus_two_map, Types.LONG()) \ .slot_sharing_group("data_stream") \ .map(lambda i: i, Types.LONG()) \ .slot_sharing_group("table") python_file_path = os.path.join(python_file_dir, "test_dep2.py") with open(python_file_path, 'w') as f: f.write("def add_three(a):\n return a + 3") def plus_three(value): from test_dep2 import add_three return add_three(value) t_env = StreamTableEnvironment.create( stream_execution_environment=env, environment_settings=EnvironmentSettings.in_streaming_mode()) env.add_python_file(python_file_path) from pyflink.table.udf import udf from pyflink.table.expressions import col add_three = udf(plus_three, result_type=DataTypes.BIGINT()) tab = t_env.from_data_stream(ds, col('a')) \ .select(add_three(col('a'))) t_env.to_append_stream(tab, Types.ROW([Types.LONG()])) \ .map(lambda i: i[0]) \ .add_sink(self.test_sink) env.execute("test add_python_file") result = self.test_sink.get_results(True) expected = ['6', '7', '8', '9', '10'] result.sort() expected.sort() self.assertEqual(expected, result)
def word_count(input_path, output_path): t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) # write all the data to one file t_env.get_config().get_configuration().set_string("parallelism.default", "1") # define the source if input_path is not None: t_env.create_temporary_table( 'source', TableDescriptor.for_connector('filesystem').schema( Schema.new_builder().column( 'word', DataTypes.STRING()).build()).option( 'path', input_path).format('csv').build()) tab = t_env.from_path('source') else: print("Executing word_count example with default input data set.") print("Use --input to specify file input.") tab = t_env.from_elements( map(lambda i: (i, ), word_count_data), DataTypes.ROW([DataTypes.FIELD('line', DataTypes.STRING())])) # define the sink if output_path is not None: t_env.create_temporary_table( 'sink', TableDescriptor.for_connector('filesystem').schema( Schema.new_builder().column('word', DataTypes.STRING()).column( 'count', DataTypes.BIGINT()).build()).option('path', output_path). format(FormatDescriptor.for_format('canal-json').build()).build()) else: print( "Printing result to stdout. Use --output to specify output path.") t_env.create_temporary_table( 'sink', TableDescriptor.for_connector('print').schema( Schema.new_builder().column('word', DataTypes.STRING()).column( 'count', DataTypes.BIGINT()).build()).build()) @udtf(result_types=[DataTypes.STRING()]) def split(line: Row): for s in line[0].split(): yield Row(s) # compute word count tab.flat_map(split).alias('word') \ .group_by(col('word')) \ .select(col('word'), lit(1).count) \ .execute_insert('sink') \ .wait()
def test_to_retract_stream(self): self.env.set_parallelism(1) t_env = StreamTableEnvironment.create( self.env, environment_settings=EnvironmentSettings.in_streaming_mode()) table = t_env.from_elements([(1, "Hi", "Hello"), (1, "Hi", "Hello")], ["a", "b", "c"]) new_table = table.group_by("c").select("a.sum, c as b") ds = t_env.to_retract_stream(table=new_table, type_info=Types.ROW([Types.LONG(), Types.STRING()])) test_sink = DataStreamTestSinkFunction() ds.map(lambda x: x).add_sink(test_sink) self.env.execute("test_to_retract_stream") result = test_sink.get_results(True) expected = ["(True, Row(f0=1, f1='Hello'))", "(False, Row(f0=1, f1='Hello'))", "(True, Row(f0=2, f1='Hello'))"] self.assertEqual(result, expected)
def test_to_append_stream(self): self.env.set_parallelism(1) t_env = StreamTableEnvironment.create( self.env, environment_settings=EnvironmentSettings.in_streaming_mode()) table = t_env.from_elements([(1, "Hi", "Hello"), (2, "Hello", "Hi")], ["a", "b", "c"]) new_table = table.select("a + 1, b + 'flink', c") ds = t_env.to_append_stream(table=new_table, type_info=Types.ROW([Types.LONG(), Types.STRING(), Types.STRING()])) test_sink = DataStreamTestSinkFunction() ds.add_sink(test_sink) self.env.execute("test_to_append_stream") result = test_sink.get_results(False) expected = ['+I[2, Hiflink, Hello]', '+I[3, Helloflink, Hi]'] self.assertEqual(result, expected)
def multi_sink(): t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) table = t_env.from_elements(elements=[(1, 'Hello'), (2, 'World'), (3, "Flink"), (4, "PyFlink")], schema=['id', 'data']) # define the sink tables t_env.execute_sql(""" CREATE TABLE first_sink ( id BIGINT, data VARCHAR ) WITH ( 'connector' = 'print' ) """) t_env.execute_sql(""" CREATE TABLE second_sink ( id BIGINT, data VARCHAR ) WITH ( 'connector' = 'print' ) """) # create a statement set statement_set = t_env.create_statement_set() # emit the data with id <= 3 to the "first_sink" via sql statement statement_set.add_insert_sql( "INSERT INTO first_sink SELECT * FROM %s WHERE id <= 3" % table) # emit the data which contains "Flink" to the "second_sink" @udf(result_type=DataTypes.BOOLEAN()) def contains_flink(data): return "Flink" in data second_table = table.where(contains_flink(table.data)) statement_set.add_insert("second_sink", second_table) # execute the statement set # remove .wait if submitting to a remote cluster, refer to # https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/python/faq/#wait-for-jobs-to-finish-when-executing-jobs-in-mini-cluster # for more details statement_set.execute().wait()
def test_mode_selection(self): builder = EnvironmentSettings.new_instance() # test the default behaviour to make sure it is consistent with the python doc environment_settings = builder.build() self.assertTrue(environment_settings.is_streaming_mode()) # test in_streaming_mode environment_settings = builder.in_streaming_mode().build() self.assertTrue(environment_settings.is_streaming_mode()) environment_settings = EnvironmentSettings.in_streaming_mode() self.assertTrue(environment_settings.is_streaming_mode()) # test in_batch_mode environment_settings = builder.in_batch_mode().build() self.assertFalse(environment_settings.is_streaming_mode()) environment_settings = EnvironmentSettings.in_batch_mode() self.assertFalse(environment_settings.is_streaming_mode())
def process_json_data_with_udf(): t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) # define the source table = t_env.from_elements(elements=[ (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}' ), (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}' ), (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}' ), (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}' ) ], schema=['id', 'data']) # define the sink t_env.create_temporary_table( 'sink', TableDescriptor.for_connector('print').schema( Schema.new_builder().column('id', DataTypes.BIGINT()).column( 'data', DataTypes.STRING()).build()).build()) # update json columns @udf(result_type=DataTypes.STRING()) def update_tel(data): json_data = json.loads(data) json_data['tel'] += 1 return json.dumps(json_data) table = table.select(table.id, update_tel(table.data)) # execute table.execute_insert('sink') \ .wait()
def setUp(self): super(StreamDependencyTests, self).setUp() self.st_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) self.st_env._execution_mode = "loopback"
def row_operations(): t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) # define the source table = t_env.from_elements(elements=[ (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}' ), (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}' ), (3, '{"name": "world", "tel": 124, "addr": {"country": "China", "city": "NewYork"}}' ), (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}' ) ], schema=['id', 'data']) # map operation @udf(result_type=DataTypes.ROW([ DataTypes.FIELD("id", DataTypes.BIGINT()), DataTypes.FIELD("country", DataTypes.STRING()) ])) def extract_country(input_row: Row): data = json.loads(input_row.data) return Row(input_row.id, data['addr']['country']) table.map(extract_country) \ .execute().print() # +----+----------------------+--------------------------------+ # | op | id | country | # +----+----------------------+--------------------------------+ # | +I | 1 | Germany | # | +I | 2 | China | # | +I | 3 | China | # | +I | 4 | China | # +----+----------------------+--------------------------------+ # flat_map operation @udtf(result_types=[DataTypes.BIGINT(), DataTypes.STRING()]) def extract_city(input_row: Row): data = json.loads(input_row.data) yield input_row.id, data['addr']['city'] table.flat_map(extract_city) \ .execute().print() # +----+----------------------+--------------------------------+ # | op | f0 | f1 | # +----+----------------------+--------------------------------+ # | +I | 1 | Berlin | # | +I | 2 | Shanghai | # | +I | 3 | NewYork | # | +I | 4 | Hangzhou | # +----+----------------------+--------------------------------+ # aggregate operation class CountAndSumAggregateFunction(AggregateFunction): def get_value(self, accumulator): return Row(accumulator[0], accumulator[1]) def create_accumulator(self): return Row(0, 0) def accumulate(self, accumulator, input_row): accumulator[0] += 1 accumulator[1] += int(input_row.tel) def retract(self, accumulator, input_row): accumulator[0] -= 1 accumulator[1] -= int(input_row.tel) def merge(self, accumulator, accumulators): for other_acc in accumulators: accumulator[0] += other_acc[0] accumulator[1] += other_acc[1] def get_accumulator_type(self): return DataTypes.ROW([ DataTypes.FIELD("cnt", DataTypes.BIGINT()), DataTypes.FIELD("sum", DataTypes.BIGINT()) ]) def get_result_type(self): return DataTypes.ROW([ DataTypes.FIELD("cnt", DataTypes.BIGINT()), DataTypes.FIELD("sum", DataTypes.BIGINT()) ]) count_sum = udaf(CountAndSumAggregateFunction()) table.add_columns( col('data').json_value('$.name', DataTypes.STRING()).alias('name'), col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'), col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \ .group_by(col('country')) \ .aggregate(count_sum.alias("cnt", "sum")) \ .select(col('country'), col('cnt'), col('sum')) \ .execute().print() # +----+--------------------------------+----------------------+----------------------+ # | op | country | cnt | sum | # +----+--------------------------------+----------------------+----------------------+ # | +I | China | 3 | 291 | # | +I | Germany | 1 | 123 | # +----+--------------------------------+----------------------+----------------------+ # flat_aggregate operation class Top2(TableAggregateFunction): def emit_value(self, accumulator): for v in accumulator: if v: yield Row(v) def create_accumulator(self): return [None, None] def accumulate(self, accumulator, input_row): tel = int(input_row.tel) if accumulator[0] is None or tel > accumulator[0]: accumulator[1] = accumulator[0] accumulator[0] = tel elif accumulator[1] is None or tel > accumulator[1]: accumulator[1] = tel def get_accumulator_type(self): return DataTypes.ARRAY(DataTypes.BIGINT()) def get_result_type(self): return DataTypes.ROW([DataTypes.FIELD("tel", DataTypes.BIGINT())]) top2 = udtaf(Top2()) table.add_columns( col('data').json_value('$.name', DataTypes.STRING()).alias('name'), col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'), col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \ .group_by(col('country')) \ .flat_aggregate(top2) \ .select(col('country'), col('tel')) \ .execute().print()
def basic_operations(): t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) # define the source table = t_env.from_elements(elements=[ (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}' ), (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}' ), (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}' ), (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}' ) ], schema=['id', 'data']) right_table = t_env.from_elements(elements=[(1, 18), (2, 30), (3, 25), (4, 10)], schema=['id', 'age']) table = table.add_columns( col('data').json_value('$.name', DataTypes.STRING()).alias('name'), col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'), col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) \ .drop_columns(col('data')) table.execute().print() # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # | op | id | name | tel | country | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # | +I | 1 | Flink | 123 | Germany | # | +I | 2 | hello | 135 | China | # | +I | 3 | world | 124 | USA | # | +I | 4 | PyFlink | 32 | China | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # limit the number of outputs table.limit(3).execute().print() # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # | op | id | name | tel | country | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # | +I | 1 | Flink | 123 | Germany | # | +I | 2 | hello | 135 | China | # | +I | 3 | world | 124 | USA | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # filter table.filter(col('id') != 3).execute().print() # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # | op | id | name | tel | country | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # | +I | 1 | Flink | 123 | Germany | # | +I | 2 | hello | 135 | China | # | +I | 4 | PyFlink | 32 | China | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # aggregation table.group_by(col('country')) \ .select(col('country'), col('id').count, col('tel').cast(DataTypes.BIGINT()).max) \ .execute().print() # +----+--------------------------------+----------------------+----------------------+ # | op | country | EXPR$0 | EXPR$1 | # +----+--------------------------------+----------------------+----------------------+ # | +I | Germany | 1 | 123 | # | +I | USA | 1 | 124 | # | +I | China | 1 | 135 | # | -U | China | 1 | 135 | # | +U | China | 2 | 135 | # +----+--------------------------------+----------------------+----------------------+ # distinct table.select(col('country')).distinct() \ .execute().print() # +----+--------------------------------+ # | op | country | # +----+--------------------------------+ # | +I | Germany | # | +I | China | # | +I | USA | # +----+--------------------------------+ # join # Note that it still doesn't support duplicate column names between the joined tables table.join(right_table.rename_columns(col('id').alias('r_id')), col('id') == col('r_id')) \ .execute().print() # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+ # | op | id | name | tel | country | r_id | age | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+ # | +I | 4 | PyFlink | 32 | China | 4 | 10 | # | +I | 1 | Flink | 123 | Germany | 1 | 18 | # | +I | 2 | hello | 135 | China | 2 | 30 | # | +I | 3 | world | 124 | USA | 3 | 25 | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+----------------------+----------------------+ # join lateral @udtf(result_types=[DataTypes.STRING()]) def split(r: Row): for s in r.name.split("i"): yield s table.join_lateral(split.alias('a')) \ .execute().print() # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+ # | op | id | name | tel | country | a | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+ # | +I | 1 | Flink | 123 | Germany | Fl | # | +I | 1 | Flink | 123 | Germany | nk | # | +I | 2 | hello | 135 | China | hello | # | +I | 3 | world | 124 | USA | world | # | +I | 4 | PyFlink | 32 | China | PyFl | # | +I | 4 | PyFlink | 32 | China | nk | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+ # show schema table.print_schema() # ( # `id` BIGINT, # `name` STRING, # `tel` STRING, # `country` STRING # ) # show execute plan print(table.join_lateral(split.alias('a')).explain())
def column_operations(): t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) # define the source table = t_env.from_elements(elements=[ (1, '{"name": "Flink", "tel": 123, "addr": {"country": "Germany", "city": "Berlin"}}' ), (2, '{"name": "hello", "tel": 135, "addr": {"country": "China", "city": "Shanghai"}}' ), (3, '{"name": "world", "tel": 124, "addr": {"country": "USA", "city": "NewYork"}}' ), (4, '{"name": "PyFlink", "tel": 32, "addr": {"country": "China", "city": "Hangzhou"}}' ) ], schema=['id', 'data']) # add columns table = table.add_columns( col('data').json_value('$.name', DataTypes.STRING()).alias('name'), col('data').json_value('$.tel', DataTypes.STRING()).alias('tel'), col('data').json_value('$.addr.country', DataTypes.STRING()).alias('country')) table.execute().print() # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+ # | op | id | data | name | tel | country | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+ # | +I | 1 | {"name": "Flink", "tel": 12... | Flink | 123 | Germany | # | +I | 2 | {"name": "hello", "tel": 13... | hello | 135 | China | # | +I | 3 | {"name": "world", "tel": 12... | world | 124 | USA | # | +I | 4 | {"name": "PyFlink", "tel": ... | PyFlink | 32 | China | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+ # drop columns table = table.drop_columns(col('data')) table.execute().print() # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # | op | id | name | tel | country | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # | +I | 1 | Flink | 123 | Germany | # | +I | 2 | hello | 135 | China | # | +I | 3 | world | 124 | USA | # | +I | 4 | PyFlink | 32 | China | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # rename columns table = table.rename_columns(col('tel').alias('telephone')) table.execute().print() # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # | op | id | name | telephone | country | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # | +I | 1 | Flink | 123 | Germany | # | +I | 2 | hello | 135 | China | # | +I | 3 | world | 124 | USA | # | +I | 4 | PyFlink | 32 | China | # +----+----------------------+--------------------------------+--------------------------------+--------------------------------+ # replace columns table = table.add_or_replace_columns( concat(col('id').cast(DataTypes.STRING()), '_', col('name')).alias('id')) table.execute().print()