def __init__(self,
             parallelism: int = 1,
             checkpoint_interval: Optional[int] = None,
             state_ttl: Optional[int] = None) -> None:
    # set up the stream execution environment
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(parallelism)
    if checkpoint_interval:
        env.set_state_backend(
            RocksDBStateBackend(self.checkpoints_path,
                                enable_incremental_checkpointing=True))
        # enable_checkpointing expects milliseconds
        env.enable_checkpointing(checkpoint_interval * 1000)
    t_config = TableConfig()
    if state_ttl:
        # idle state is kept for at least state_ttl seconds and cleaned up
        # no later than state_ttl + 300 seconds
        t_config.set_idle_state_retention_time(
            timedelta(seconds=state_ttl),
            timedelta(seconds=state_ttl + 300))
    table_env = StreamTableEnvironment.create(env, table_config=t_config)
    table_env.get_config().get_configuration().set_string(
        "pipeline.jars", self.flink_sql_connector_kafka_jar)

    # register the tables declared as DDL strings
    for ddl in self.tables:
        table_env.execute_sql(ddl)

    self.env = env
    self.table_env = table_env
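# A minimal usage sketch of the constructor above. KafkaSqlPipeline is a
# hypothetical class name, and the attribute values (checkpoints_path,
# flink_sql_connector_kafka_jar, tables) are assumed to be provided by the
# class before __init__ runs; none of this is from the original source.
pipeline = KafkaSqlPipeline(
    parallelism=2,
    checkpoint_interval=60,  # seconds; converted to ms for enable_checkpointing
    state_ttl=3600)          # idle state kept for 1 h, cleaned up within 1 h 5 min
pipeline.table_env.execute_sql("SELECT * FROM source_table").print()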
def test_get_configuration(self):
    table_config = TableConfig.get_default()
    table_config.get_configuration().set_string("k1", "v1")
    self.assertEqual(table_config.get_configuration().get_string("k1", ""), "v1")
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # used to test the pipeline.jars and pipeline.classpaths configuration
    config_key = sys.argv[1]
    config_value = sys.argv[2]
    t_env.get_config().get_configuration().set_string(config_key, config_value)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT,
            `count_java` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.sql_update(sink_ddl)

    t_env.sql_update(
        "create temporary system function add_one as 'add_one.add_one' language python")
    t_env.register_java_function("add_one_java",
                                 "org.apache.flink.python.tests.util.AddOne")

    elements = [(word, 0) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .select("word, add_one(count) as count, add_one_java(count) as count_java") \
        .group_by("word") \
        .select("word, count(count) as count, count(count_java) as count_java") \
        .insert_into("Results")

    t_env.execute("word_count")
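# A sketch of the Python UDF module that the DDL above references via the
# function path 'add_one.add_one'. This is an assumption based on that path,
# not the original file; the decorator form shown works in PyFlink 1.10+.
from pyflink.table import DataTypes
from pyflink.table.udf import udf


@udf(input_types=[DataTypes.BIGINT()], result_type=DataTypes.BIGINT())
def add_one(i):
    # increments the input column by one, matching the UDF name
    return i + 1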
def setUp(self):
    super(PyFlinkOldBatchTableTestCase, self).setUp()
    self.env = ExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)
    self.t_env = BatchTableEnvironment.create(self.env, TableConfig())
    self.t_env.get_config().get_configuration().set_string(
        "python.fn-execution.bundle.size", "1")
def create_table_env(self):
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)
    statement_set = t_env.create_statement_set()
    return exec_env, t_env, statement_set
def test_get_set_local_timezone(self):
    table_config = TableConfig.get_default()
    table_config.set_local_timezone("Asia/Shanghai")
    timezone = table_config.get_local_timezone()
    self.assertEqual(timezone, "Asia/Shanghai")
def setUp(self):
    super(PyFlinkBatchTableTestCase, self).setUp()
    self.env = ExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)
    self.t_env = BatchTableEnvironment.create(self.env, TableConfig())
    self.t_env.get_config().get_configuration().set_string(
        "taskmanager.memory.task.off-heap.size", "80mb")
def test_get_set_idle_state_retention(self):
    table_config = TableConfig.get_default()
    table_config.set_idle_state_retention(datetime.timedelta(days=1))
    self.assertEqual(datetime.timedelta(days=1),
                     table_config.get_idle_state_retention())
def demo02():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # the new 'filesystem' connector uses the 'path' and 'format' keys;
    # mixing it with the legacy 'format.type' / 'connector.path' keys fails
    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/input'
        )
    """
    my_sink_ddl = """
        create table mySink (
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector' = 'filesystem',
            'format' = 'csv',
            'path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output'
        )
    """
    t_env.execute_sql(my_source_ddl)
    t_env.execute_sql(my_sink_ddl)

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
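# For comparison, a sketch of the same sink declared with the legacy
# connector property keys used in several other snippets in this collection.
# Only one style should be used per table; mixing the two fails validation.
my_sink_ddl_legacy = """
    create table mySink (
        word VARCHAR,
        `count` BIGINT
    ) with (
        'connector.type' = 'filesystem',
        'format.type' = 'csv',
        'connector.path' = 'F:/github/openjw/penter/bigdata_study/pyflink1.x/batch/demo01/output'
    )
"""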
def demo01():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    # fails if the output file already exists
    t_env.connect(FileSystem().path(r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    tab = t_env.from_path('mySource')
    tab.group_by(tab.word) \
        .select(tab.word, lit(1).count) \
        .execute_insert('mySink').wait()
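# A small guard sketch: per the comment in demo01 above, the OldCsv sink
# raises an error when the output file already exists, so remove it before
# running. The path mirrors the example above.
import os

out_path = r'F:\github\openjw\penter\bigdata_study\pyflink1.x\batch\demo01\output'
if os.path.exists(out_path):
    os.remove(out_path)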
def test_get_set_max_generated_code_length(self):
    table_config = TableConfig.get_default()
    table_config.set_max_generated_code_length(32000)
    max_generated_code_length = table_config.get_max_generated_code_length()
    self.assertEqual(max_generated_code_length, 32000)
def test_add_configuration(self):
    table_config = TableConfig.get_default()
    configuration = Configuration()
    configuration.set_string("k1", "v1")
    table_config.add_configuration(configuration)
    self.assertEqual(table_config.get("k1", ""), "v1")
def test_get_set_idle_state_retention_time(self):
    table_config = TableConfig.get_default()
    table_config.set_idle_state_retention_time(
        datetime.timedelta(days=1), datetime.timedelta(days=2))
    # TableConfig stores a single retention duration, so the reported max is
    # derived as 1.5 x the configured minimum (1.5 days here), not the
    # 2 days passed in above
    self.assertEqual(3 * 24 * 3600 * 1000 / 2,
                     table_config.get_max_idle_state_retention_time())
    self.assertEqual(24 * 3600 * 1000,
                     table_config.get_min_idle_state_retention_time())
def test_get_set_sql_dialect(self):
    table_config = TableConfig.get_default()
    sql_dialect = table_config.get_sql_dialect()
    self.assertEqual(sql_dialect, SqlDialect.DEFAULT)
    table_config.set_sql_dialect(SqlDialect.HIVE)
    sql_dialect = table_config.get_sql_dialect()
    self.assertEqual(sql_dialect, SqlDialect.HIVE)
def create_env(
        self) -> (StreamExecutionEnvironment, StreamTableEnvironment, StatementSet):
    exec_env = StreamExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = StreamTableEnvironment.create(exec_env, t_config)
    t_env.get_config().get_configuration().set_string(
        "taskmanager.memory.task.off-heap.size", '80m')
    statement_set = t_env.create_statement_set()
    return exec_env, t_env, statement_set
def test_get_set_null_check(self):
    table_config = TableConfig.get_default()
    null_check = table_config.get_null_check()
    self.assertTrue(null_check)
    table_config.set_null_check(False)
    null_check = table_config.get_null_check()
    self.assertFalse(null_check)
def word_count():
    result = wikipedia.page("New York City")
    content = result.summary

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    print(add.add(10, 5))
    print("Word Count")

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    # DDL-based sink, kept for reference:
    # sink_ddl = """
    #     create table Results(
    #         word VARCHAR,
    #         `count` BIGINT
    #     ) with (
    #         'connector.type' = 'filesystem',
    #         'format.type' = 'csv',
    #         'connector.path' = '{}'
    #     )
    #     """.format(result_path)
    # t_env.sql_update(sink_ddl)

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('Results')

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute("word_count")
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    # t_config.set_python_executable("/opt/python38/bin/python3")
    # alternatively, add "python.client.executable: /usr/bin/python3"
    # to conf/flink-conf.yaml
    t_env = BatchTableEnvironment.create(env, t_config)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(result_path)
    t_env.execute_sql(sink_ddl)

    elements = [(word, 1) for word in content.split(" ")]
    table = t_env.from_elements(elements, ["word", "count"])
    table.group_by(table.word) \
        .select(table.word, expr.lit(1).count.alias('count')) \
        .execute_insert("Results").wait()
def word_count():
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = BatchTableEnvironment.create(env, t_config)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("len", DataTypes.INT())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    t_env.register_java_function("len", "org.apache.flink.udf.UDFLength")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, len(word), count(1) as count") \
        .insert_into("Results")

    t_env.execute("word_count")
def word_count():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    content = "line Licensed to the Apache Software Foundation ASF under one " \
              "line or more contributor license agreements See the NOTICE file " \
              "line distributed with this work for additional information " \
              "line regarding copyright ownership The ASF licenses this file " \
              "to you under the Apache License Version the " \
              "License you may not use this file except in compliance " \
              "with the License"
    with open(source_path, 'w') as f:
        for word in content.split(" "):
            f.write(",".join([word, "1"]))
            f.write("\n")

    t_config = TableConfig.Builder().as_batch_execution().set_parallelism(1).build()
    t_env = TableEnvironment.create(t_config)

    field_names = ["word", "count"]
    field_types = [DataTypes.STRING, DataTypes.LONG]

    # register the Word table in the table environment
    t_env.register_table_source(
        "Word", CsvTableSource(source_path, field_names, field_types))

    # register the Results table in the table environment
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink("Results", field_names, field_types,
                              CsvTableSink(tmp_csv))

    t_env.scan("Word") \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute()
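# TableConfig.Builder() and TableEnvironment.create(t_config), as used above,
# come from the early PyFlink preview API. A rough modern equivalent
# (PyFlink 1.11+), shown as a sketch rather than a drop-in replacement:
from pyflink.table import EnvironmentSettings, TableEnvironment

# create a batch TableEnvironment and set parallelism via configuration
settings = EnvironmentSettings.new_instance().in_batch_mode().build()
t_env = TableEnvironment.create(settings)
t_env.get_config().get_configuration().set_string("parallelism.default", "1")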
def word_count():
    # read and concatenate the five input files
    content = ""
    for name in ("1", "2", "3", "4", "5"):
        with open("/home/mnm/flink-1.9.1/" + name, "r") as f:
            content += f.read()

    t_config = TableConfig()
    env = ExecutionEnvironment.get_execution_environment()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register the Results table in the table environment
    tmp_dir = tempfile.gettempdir()
    result_path = tmp_dir + '/result'
    if os.path.exists(result_path):
        try:
            if os.path.isfile(result_path):
                os.remove(result_path)
            else:
                shutil.rmtree(result_path)
        except OSError as e:
            logging.error("Error removing directory: %s - %s.", e.filename, e.strerror)

    logging.info("Results directory: %s", result_path)

    t_env.connect(FileSystem().path(result_path)) \
        .with_format(OldCsv()
                     .field_delimiter(',')
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field("word", DataTypes.STRING())
                     .field("count", DataTypes.BIGINT())) \
        .register_table_sink("Results")

    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    t_env.execute("Python batch word count")
def test_get_set_decimal_context(self):
    table_config = TableConfig.get_default()
    # exercise every supported rounding mode
    for rounding_mode in ("UNNECESSARY", "HALF_EVEN", "HALF_DOWN", "HALF_UP",
                          "FLOOR", "CEILING", "DOWN", "UP"):
        table_config.set_decimal_context(20, rounding_mode)
        self.assertEqual((20, rounding_mode), table_config.get_decimal_context())
def test_end_to_end():
    tmp_dir = tempfile.gettempdir()
    source_path = tmp_dir + '/streaming.csv'
    if os.path.isfile(source_path):
        os.remove(source_path)
    with open(source_path, 'w') as f:
        lines = '1,hi,hello\n' + '2,hi,hello\n'
        f.write(lines)

    _find_flink_home()
    print("using %s as FLINK_HOME..." % os.environ["FLINK_HOME"])

    t_config = TableConfig.Builder().as_streaming_execution().set_parallelism(1).build()
    t_env = TableEnvironment.get_table_environment(t_config)

    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT, DataTypes.STRING, DataTypes.STRING]

    # register the Orders table in the table environment
    t_env.register_table_source(
        "Orders", CsvTableSource(source_path, field_names, field_types))

    # register the Results table in the table environment
    tmp_csv = tmp_dir + '/streaming2.csv'
    if os.path.isfile(tmp_csv):
        os.remove(tmp_csv)
    t_env.register_table_sink(
        "Results", field_names, field_types, CsvTableSink(tmp_csv))

    t_env.scan("Orders") \
        .where("a > 0") \
        .select("a + 1, b, c") \
        .insert_into("Results")
    t_env.execute()

    with open(tmp_csv, 'r') as f:
        lines = f.read()
        assert lines == '2,hi,hello\n' + '3,hi,hello\n'
    print("test passed, the log file is under this directory: %s/log"
          % os.environ["FLINK_HOME"])
def word_count():
    # declare a table environment and set configurations
    env = ExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(env, t_config)

    # register the Results table in the table environment
    output_file = os.path.abspath('.') + '/out.txt'
    if os.path.exists(output_file):
        try:
            if os.path.isfile(output_file):
                os.remove(output_file)
        except OSError as e:
            print("Error", e.filename, e.strerror)

    print("Results:", output_file)

    sink_ddl = """
        create table Results(
            word VARCHAR,
            `count` BIGINT
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = '{}'
        )
        """.format(output_file)
    t_env.sql_update(sink_ddl)

    # create the source table from a single string, perform some
    # transformations, and write the results to the Results table
    content = "Who's there? I think I hear them. Stand, ho! Who's there?"
    elements = [(word, 1) for word in content.split(" ")]
    t_env.from_elements(elements, ["word", "count"]) \
        .group_by("word") \
        .select("word, count(1) as count") \
        .insert_into("Results")

    # execute the Flink Python Table API job
    t_env.execute("word_count")
def run(self):
    # a StreamTableEnvironment requires a StreamExecutionEnvironment
    exec_env = StreamExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = StreamTableEnvironment.create(exec_env, t_config)

    t_env.connect(FileSystem().path('/tmp/input')) \
        .with_format(OldCsv()
                     .field('word', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())) \
        .create_temporary_table('mySource')

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    # load the pre-trained PMML model
    model = Model.fromFile('./../batch_ml/model.pmml')

    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')
    t_env.execute("tutorial_job")

    self.read_data()
    result = model.predict({
        "Sepal_Length": 5.1,
        "Sepal_Width": 3.5,
        "Petal_Length": 1.4,
        "Petal_Width": 0.2
    })
def main():
    exec_env = ExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = BatchTableEnvironment.create(exec_env, t_config)

    # descriptor-based source, kept for reference:
    # t_env.connect(FileSystem().path('./temp/deviceorientation')) \
    #     .with_format(OldCsv()
    #                  .field('word', DataTypes.STRING())) \
    #     .with_schema(Schema()
    #                  .field('word', DataTypes.STRING())) \
    #     .create_temporary_table('mySource')

    my_source_ddl = """
        create table mySource (
            word VARCHAR
        ) with (
            'connector.type' = 'filesystem',
            'format.type' = 'csv',
            'connector.path' = './temp/input'
        )
    """
    t_env.sql_update(my_source_ddl)

    t_env.connect(FileSystem().path('/tmp/output')) \
        .with_format(OldCsv()
                     .field_delimiter('\t')
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .with_schema(Schema()
                     .field('word', DataTypes.STRING())
                     .field('count', DataTypes.BIGINT())) \
        .create_temporary_table('mySink')

    t_env.from_path('mySource') \
        .group_by('word') \
        .select('word, count(1)') \
        .insert_into('mySink')

    t_env.execute("tutorial_job")


if __name__ == '__main__':
    main()
def setUp(self):
    super(PyFlinkStreamTableTestCase, self).setUp()
    self.t_config = TableConfig.Builder() \
        .as_streaming_execution() \
        .set_parallelism(4) \
        .build()
    self.t_env = TableEnvironment.get_table_environment(self.t_config)
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem

exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(2)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

t_env.connect(FileSystem().path('input')) \
    .with_format(OldCsv()
                 .line_delimiter(' ')
                 .field('word', DataTypes.STRING())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())) \
    .register_table_source("inputSource")

t_env.connect(FileSystem().path('output')) \
    .with_format(OldCsv()
                 .field_delimiter(',')
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .with_schema(Schema()
                 .field('word', DataTypes.STRING())
                 .field('count', DataTypes.BIGINT())) \
    .register_table_sink('sink')

t_env.scan('inputSource') \
    .group_by('word') \
    .select('word, count(1)') \
    .insert_into('sink')

t_env.execute('my first job')
def test_set_get_python_executable(self):
    table_config = TableConfig()
    table_config.set_python_executable("/usr/bin/python3")
    self.assertEqual("/usr/bin/python3", table_config.get_python_executable())
def setUp(self):
    super(PyFlinkBatchTableTestCase, self).setUp()
    self.t_config = TableConfig.Builder() \
        .as_batch_execution() \
        .set_parallelism(1) \
        .build()
    self.t_env = TableEnvironment.create(self.t_config)