def inner_join_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    result_file = "/tmp/table_inner_join_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    left = st_env.from_elements([(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"),
                                 (2, "4b", "4bb"), (5, "5a", "5aa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"),
                                  (4, "4b", "4bb")],
                                 ["d", "e", "f"]).select("d, e, f")
    st_env.register_table_sink(
        "result",
        CsvTableSink(
            ["a", "b", "c"],
            [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()],
            result_file))
    result = left.join(right).where("a = d").select("a, b, e")
    result.insert_into("result")
    st_env.execute("inner join streaming")

def add_train_chief_alone_table():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    statement_set = table_env.create_statement_set()
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/add.py"
    func = "map_func"
    prop = {}
    prop[TFCONSTANS.TF_IS_CHIEF_ALONE] = "true"
    prop[MLCONSTANTS.PYTHON_VERSION] = "3.7"
    env_path = None
    input_tb = None
    output_schema = None

    tf_config = TFConfig(work_num, ps_num, prop, python_file, func, env_path)

    train(stream_env, table_env, statement_set, input_tb, tf_config,
          output_schema)

    # inference(stream_env, table_env, statement_set, input_tb, tf_config, output_schema)

    job_client = statement_set.execute().get_job_client()
    if job_client is not None:
        job_client.get_job_execution_result(
            user_class_loader=None).result()

def test_execute(self):
    tmp_dir = tempfile.gettempdir()
    field_names = ['a', 'b', 'c']
    field_types = [
        DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()
    ]
    t_env = StreamTableEnvironment.create(self.env)
    t_env.register_table_sink(
        'Results',
        CsvTableSink(
            field_names, field_types,
            os.path.join('{}/{}.csv'.format(tmp_dir, round(time.time())))))
    t_env.insert_into(
        'Results',
        t_env.from_elements([(1, 'Hi', 'Hello')], ['a', 'b', 'c']))
    execution_result = t_env.execute('test_stream_execute')
    self.assertIsNotNone(execution_result.get_job_id())
    self.assertIsNotNone(execution_result.get_net_runtime())
    self.assertEqual(len(execution_result.get_all_accumulator_results()), 0)
    self.assertIsNone(
        execution_result.get_accumulator_result('accumulator'))
    self.assertIsNotNone(str(execution_result))

def __init__(self):
    # self.feature_extractor = DemoFeatureExtractor()
    self.settings = EnvironmentSettings.new_instance().in_streaming_mode(
    ).use_blink_planner().build()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(1)
    self.table_env = StreamTableEnvironment.create(
        self.env, environment_settings=self.settings)
    self.table_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)
    self.table_env.get_config().get_configuration().set_string(
        "python.fn-execution.buffer.memory.size", "1024mb")
    self.table_env.get_config().get_configuration().set_string(
        "parallelism.default", "3")
    self.table_env.get_config().get_configuration().set_string(
        "python.fn-execution.bundle.size", "5000")
    self.table_env.get_config().get_configuration().set_string(
        "restart-strategy", "fixed-delay")
    self.table_env.get_config().get_configuration().set_string(
        "restart-strategy.fixed-delay.attempts", "3")
    self.table_env.get_config().get_configuration().set_string(
        "restart-strategy.fixed-delay.delay", "30s")

    source_table = open('source.sql', 'r').read()
    sink_table = open('sink.sql', 'r').read()

    self.table_env.execute_sql(source_table)
    self.table_env.execute_sql(sink_table)

def select_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_select_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "c"], [DataTypes.STRING(), DataTypes.INT()],
                     result_file))
    orders = st_env.scan("Orders")
    result = orders.select("a, b")
    result.insert_into("result")
    st_env.execute("select streaming")

def __init__(self,
             parallelism: int = 1,
             checkpoint_interval: Optional[int] = None,
             state_ttl: Optional[int] = None) -> None:
    # setting env
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(parallelism)
    if checkpoint_interval:
        env.set_state_backend(
            RocksDBStateBackend(self.checkpoints_path,
                                enable_incremental_checkpointing=True))
        env.enable_checkpointing(checkpoint_interval * 1000)
    t_config = TableConfig()
    if state_ttl:
        t_config.set_idle_state_retention_time(
            timedelta(seconds=state_ttl), timedelta(seconds=state_ttl + 300))
    table_env = StreamTableEnvironment.create(env, table_config=t_config)
    table_env.get_config().get_configuration().set_string(
        "pipeline.jars", self.flink_sql_connector_kafka_jar)

    # set up table
    for ddl in self.tables:
        table_env.execute_sql(ddl)

    self.env = env
    self.table_env = table_env

def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.size", 300000)
    t_env.get_config().get_configuration().set_integer("python.fn-execution.bundle.time", 1000)
    t_env.get_config().get_configuration().set_boolean("pipeline.object-reuse", True)
    t_env.register_table_sink(
        "sink",
        PrintTableSink(
            ["id"],
            [DataTypes.INT(False)]))

    @udf(input_types=[DataTypes.INT(False)], result_type=DataTypes.INT(False))
    def inc(x):
        return x + 1

    t_env.register_function("inc", inc)
    t_env.register_java_function("java_inc", "com.alibaba.flink.function.JavaInc")

    num_rows = 100000000
    t_env.from_table_source(RangeTableSource(1, num_rows, 1)).alias("id") \
        .select("inc(id)") \
        .insert_into("sink")

    beg_time = time.time()
    t_env.execute("Python UDF")
    print("PyFlink Python UDF inc() consume time: " + str(time.time() - beg_time))

def test_table_environment_with_blink_planner(self):
    self.env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance(
        ).use_blink_planner().build())

    source_path = os.path.join(self.tempdir + '/streaming.csv')
    sink_path = os.path.join(self.tempdir + '/result.csv')
    field_names = ["a", "b", "c"]
    field_types = [DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()]
    data = [(1, 'hi', 'hello'), (2, 'hello', 'hello')]
    csv_source = self.prepare_csv_source(source_path, data, field_types, field_names)

    t_env.register_table_source("source", csv_source)
    t_env.register_table_sink(
        "sink",
        CsvTableSink(field_names, field_types, sink_path))

    source = t_env.scan("source")
    result = source.alias("a, b, c").select("1 + a, b, c")
    result.insert_into("sink")
    t_env.execute("blink_test")

    results = []
    with open(sink_path, 'r') as f:
        results.append(f.readline())
        results.append(f.readline())

    self.assert_equals(results, ['2,hi,hello\n', '3,hello,hello\n'])

def main_flink():
    # The previous steps take a file, preprocess it, and write the result to the 'input' file.
    env = StreamExecutionEnvironment.get_execution_environment()
    parr_num = 4
    env.set_parallelism(parr_num)
    t_env = StreamTableEnvironment.create(env)

    @udf(input_types=DataTypes.STRING(), result_type=DataTypes.STRING())
    def cut_extract(string):
        return cut_posseg.cut_extract(string)

    t_env.register_function("cut_extract", cut_extract)
    # Create the source table and read from 'input'.
    # Question 1: is there a way to use a custom list (e.g. [text A, text B]) as the input,
    # to save the I/O overhead?
    t_env.connect(FileSystem().path('/home/sjtu/input')) \
        .with_format(OldCsv()
                     .field('text', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('text', DataTypes.STRING())) \
        .create_temporary_table('mySource')
    t_env.connect(FileSystem().path('/home/sjtu/output')) \
        .with_format(OldCsv()
                     .field('result', DataTypes.STRING())) \
        .with_schema(Schema()
                     .field('result', DataTypes.STRING())) \
        .create_temporary_table('mySink')
    t_env.from_path('mySource') \
        .select("cut_extract(text)") \
        .insert_into('mySink')
    # Question 2: here the result table is written to a file, but I still need to keep processing
    # the processed data in this code. Is there a way to fetch the mySink table above directly as
    # in-memory data instead of reading it back from disk?
    t_env.execute("tutorial_job")

def init_env(self, **kwargs):
    env = StreamExecutionEnvironment.get_execution_environment()
    self.st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build()
    )
    return

def train(stream_env=None,
          table_env=None,
          statement_set=None,
          input_table=None,
          tf_config=None,
          output_schema=None):
    """
    Run TF train for the Flink Table API.

    :param stream_env: The StreamExecutionEnvironment. If it is None, this method
                       creates one and executes the job at the end. Otherwise, the
                       caller is responsible for triggering the job execution.
    :param table_env: The Flink TableEnvironment.
    :param statement_set: The StatementSet created by the given TableEnvironment.
    :param input_table: The input Table.
    :param tf_config: Configurations for the TF program.
    :param output_schema: The TableSchema for the output Table. If it is null, a dummy
                          sink will be connected. Otherwise, the caller is responsible
                          for adding a sink to the output Table before executing the graph.
    :return: output Table.
    """
    if stream_env is None:
        stream_env = StreamExecutionEnvironment.get_execution_environment()
    if table_env is None:
        table_env = StreamTableEnvironment.create(stream_env)
    if statement_set is None:
        statement_set = table_env.create_statement_set()
    if input_table is not None:
        input_table = input_table._j_table
    if output_schema is not None:
        output_schema = output_schema._j_table_schema
    output_table = get_gateway(
    ).jvm.org.flinkextended.flink.ml.tensorflow.client.TFUtils.train(
        stream_env._j_stream_execution_environment, table_env._j_tenv,
        statement_set._j_statement_set, input_table, tf_config.java_config(),
        output_schema)
    return Table(output_table, table_env)

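# A condensed, hypothetical usage sketch of train(): it mirrors the calling pattern of
# add_train_chief_alone_table and input_output_table in this section, so the add.py path,
# the "map_func" entry point, and the property values are placeholders borrowed from those
# tests, not a prescribed configuration.
def train_usage_sketch():
    # Build the environments explicitly; train() only wraps the Java TFUtils.train call.
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    statement_set = table_env.create_statement_set()

    # Cluster layout (2 workers, 1 PS) and the Python entry point.
    prop = {MLCONSTANTS.PYTHON_VERSION: "3.7"}
    python_file = os.getcwd() + "/../../src/test/python/add.py"
    tf_config = TFConfig(2, 1, prop, python_file, "map_func", None)

    # No input table and no output schema, so a dummy sink is connected internally.
    train(stream_env, table_env, statement_set, None, tf_config, None)

    # The caller triggers execution through the statement set.
    job_client = statement_set.execute().get_job_client()
    if job_client is not None:
        job_client.get_job_execution_result(user_class_loader=None).result()
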
def test_custom_env(self):
    import pyflink
    from pyflink.dataset import ExecutionEnvironment
    from pyflink.datastream import StreamExecutionEnvironment
    benv = ExecutionEnvironment.get_execution_environment()
    senv = StreamExecutionEnvironment.get_execution_environment()

    from pyflink.table import BatchTableEnvironment
    from pyflink.table import StreamTableEnvironment
    btenv = BatchTableEnvironment.create(benv)
    stenv = StreamTableEnvironment.create(senv)

    mlenv = useCustomEnv(pyflink.java_gateway.get_gateway(),
                         benv, btenv, senv, stenv)

    t = mlenv.btenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    source = TableSourceBatchOp(t)
    source.print()

    t = mlenv.stenv.from_elements([(1, 2), (2, 5), (3, 1)], ['a', 'b'])
    source = TableSourceStreamOp(t)
    source.print()
    StreamOperator.execute()

    from pyalink.alink import env
    env._in_custom_env = False
    resetEnv()

def full_outer_join_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    left = st_env.from_elements([(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"),
                                 (2, "4b", "4bb"), (5, "5a", "5aa")],
                                ["a", "b", "c"]).select("a, b, c")
    right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"),
                                  (4, "4b", "4bb")],
                                 ["d", "e", "f"]).select("d, e, f")

    result = left.full_outer_join(right, "a = d").select("a, b, e")
    # use custom retract sink connector
    sink = TestRetractSink(
        ["a", "b", "c"],
        [DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()])
    st_env.register_table_sink("sink", sink)
    result.insert_into("sink")
    st_env.execute("full outer join streaming")

def input_output_table():
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    table_env = StreamTableEnvironment.create(stream_env)
    statement_set = table_env.create_statement_set()
    work_num = 2
    ps_num = 1
    python_file = os.getcwd() + "/../../src/test/python/input_output.py"
    prop = {}
    func = "map_func"
    env_path = None
    prop[MLCONSTANTS.ENCODING_CLASS] = "org.flinkextended.flink.ml.operator.coding.RowCSVCoding"
    prop[MLCONSTANTS.DECODING_CLASS] = "org.flinkextended.flink.ml.operator.coding.RowCSVCoding"
    inputSb = "INT_32" + "," + "INT_64" + "," + "FLOAT_32" + "," + "FLOAT_64" + "," + "STRING"
    prop["sys:csv_encode_types"] = inputSb
    prop["sys:csv_decode_types"] = inputSb
    prop[MLCONSTANTS.PYTHON_VERSION] = "3.7"

    source_file = os.getcwd() + "/../../src/test/resources/input.csv"
    sink_file = os.getcwd() + "/../../src/test/resources/output.csv"
    table_source = CsvTableSource(source_file, ["a", "b", "c", "d", "e"], [
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING()
    ])
    table_env.register_table_source("source", table_source)
    input_tb = table_env.from_path("source")
    output_schema = TableSchema(["a", "b", "c", "d", "e"], [
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING()
    ])
    sink = CsvTableSink(["a", "b", "c", "d", "e"], [
        DataTypes.INT(),
        DataTypes.BIGINT(),
        DataTypes.FLOAT(),
        DataTypes.DOUBLE(),
        DataTypes.STRING()
    ], sink_file, write_mode=WriteMode.OVERWRITE)
    table_env.register_table_sink("table_row_sink", sink)

    tf_config = TFConfig(work_num, ps_num, prop, python_file, func, env_path)

    output_table = train(stream_env, table_env, statement_set, input_tb,
                         tf_config, output_schema)
    # output_table = inference(stream_env, table_env, statement_set, input_tb, tf_config, output_schema)

    statement_set.add_insert("table_row_sink", output_table)
    job_client = statement_set.execute().get_job_client()
    if job_client is not None:
        job_client.get_job_execution_result(
            user_class_loader=None).result()

def filter_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    result_file = "/tmp/table_filter_streaming.csv"
    if os.path.exists(result_file):
        os.remove(result_file)
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.register_table_sink(
        "result",
        CsvTableSink(["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ], result_file))
    orders = st_env.scan("Orders")
    result = orders.filter("b % 2 === 0")
    result.insert_into("result")
    st_env.execute("filter streaming")

def log_processing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env_settings = EnvironmentSettings.Builder().use_blink_planner().build()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=env_settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
            CREATE TABLE payment_msg(
                createTime VARCHAR,
                orderId BIGINT,
                payAmount DOUBLE,
                payPlatform INT,
                provinceId INT
            ) WITH (
              'connector.type' = 'kafka',
              'connector.version' = 'universal',
              'connector.topic' = 'payment_msg',
              'connector.properties.bootstrap.servers' = 'kafka:9092',
              'connector.properties.group.id' = 'test_3',
              'connector.startup-mode' = 'latest-offset',
              'format.type' = 'json'
            )
            """

    es_sink_ddl = """
            CREATE TABLE es_sink (
                province VARCHAR PRIMARY KEY,
                pay_amount DOUBLE
            ) with (
                'connector.type' = 'elasticsearch',
                'connector.version' = '7',
                'connector.hosts' = 'http://elasticsearch:9200',
                'connector.index' = 'platform_pay_amount_1',
                'connector.document-type' = 'payment',
                'update-mode' = 'upsert',
                'connector.flush-on-checkpoint' = 'true',
                'connector.key-delimiter' = '$',
                'connector.key-null-literal' = 'n/a',
                'connector.bulk-flush.max-size' = '42mb',
                'connector.bulk-flush.max-actions' = '32',
                'connector.bulk-flush.interval' = '1000',
                'connector.bulk-flush.backoff.delay' = '1000',
                'format.type' = 'json'
            )
            """

    t_env.sql_update(source_ddl)
    t_env.sql_update(es_sink_ddl)

    t_env.register_function('province_id_to_name', province_id_to_name)

    t_env.from_path("payment_msg") \
        .select("province_id_to_name(provinceId) as province, payAmount") \
        .group_by("province") \
        .select("province, sum(payAmount) as pay_amount") \
        .insert_into("es_sink")

    t_env.execute("payment_demo")

def setUp(self):
    super(PyFlinkBlinkStreamTableTestCase, self).setUp()
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self.env.set_parallelism(2)
    self.t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())

def create_env(
        self) -> (ExecutionEnvironment, TableEnvironment, StatementSet):
    exec_env = StreamExecutionEnvironment.get_execution_environment()
    exec_env.set_parallelism(1)
    t_config = TableConfig()
    t_env = StreamTableEnvironment.create(exec_env, t_config)
    t_env.get_config().get_configuration().set_string(
        "taskmanager.memory.task.off-heap.size", '80m')
    statement_set = t_env.create_statement_set()
    return exec_env, t_env, statement_set

def test_create_table_environment_with_blink_planner(self):
    t_env = StreamTableEnvironment.create(
        self.env,
        environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())

    planner = t_env._j_tenv.getPlanner()

    self.assertEqual(
        planner.getClass().getName(),
        "org.apache.flink.table.planner.delegation.StreamPlanner")

def group_by_agg_streaming():
    s_env = StreamExecutionEnvironment.get_execution_environment()
    s_env.set_parallelism(1)
    # use blink table planner
    st_env = StreamTableEnvironment.create(
        s_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    # use flink table planner
    # st_env = StreamTableEnvironment.create(s_env)
    source_file = os.getcwd() + "/../resources/table_orders.csv"
    st_env.register_table_source(
        "Orders",
        CsvTableSource(source_file, ["a", "b", "c", "rowtime"], [
            DataTypes.STRING(),
            DataTypes.INT(),
            DataTypes.INT(),
            DataTypes.TIMESTAMP()
        ]))
    st_env.connect(
        Elasticsearch()
        .version("6")
        .host("localhost", 9200, "http")
        .index("group_by_agg_streaming")
        .document_type('pyflink')
        .key_delimiter("_")
        .key_null_literal("null")
        .failure_handler_ignore()
        .disable_flush_on_checkpoint()
        .bulk_flush_max_actions(2)
        .bulk_flush_max_size("1 mb")
        .bulk_flush_interval(5000)
    ) \
        .with_schema(
            Schema()
            .field("a", DataTypes.STRING())
            .field("b", DataTypes.STRING())
        ) \
        .with_format(
            Json()
            .derive_schema()
        ) \
        .in_upsert_mode() \
        .register_table_sink("result")

    orders = st_env.scan("Orders")
    group_by_table = orders.group_by("a").select("a, b.sum as d")
    # Because the schema of the target index in Elasticsearch is
    # {"a":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
    #  "b":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}},
    # we need to cast the aggregated value to VARCHAR in this demo.
    st_env.register_table("group_table", group_by_table)
    result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table")
    result.insert_into("result")
    st_env.execute("group by agg streaming")

def test_construct_with_stream_env(self):
    stream_execution_environment = StreamExecutionEnvironment.get_execution_environment()
    stream_table_environment = StreamTableEnvironment.create(stream_execution_environment)

    ml_environment = MLEnvironment(
        stream_exe_env=stream_execution_environment,
        stream_tab_env=stream_table_environment)
    self.assertEqual(
        ml_environment.get_stream_execution_environment(),
        stream_execution_environment)
    self.assertEqual(
        ml_environment.get_stream_table_environment(),
        stream_table_environment)

def get_stream_table_environment(self) -> StreamTableEnvironment:
    """
    Get the StreamTableEnvironment. If the StreamTableEnvironment has not been set,
    it initializes the StreamTableEnvironment with the default Configuration.

    :return: the StreamTableEnvironment.
    """
    if self._stream_tab_env is None:
        self._stream_tab_env = StreamTableEnvironment.create(
            StreamExecutionEnvironment.get_execution_environment())
    return self._stream_tab_env

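# A minimal sketch of the lazy initialization described in the docstring above. It assumes
# MLEnvironment can be constructed without arguments (the test above passes explicit
# environments), so treat the no-argument constructor as an assumption.
ml_env = MLEnvironment()
t_env = ml_env.get_stream_table_environment()        # created on first access
same_t_env = ml_env.get_stream_table_environment()   # later calls reuse the cached instance
assert t_env is same_t_env
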
def setUp(self):
    self.env = StreamExecutionEnvironment.get_execution_environment()
    self._load_dependency_jars()
    config = Configuration(
        j_configuration=get_j_env_configuration(self.env._j_stream_execution_environment))
    config.set_boolean("execution.checkpointing.checkpoints-after-tasks-finish.enabled", True)
    self.env.set_parallelism(4)
    self.env.enable_checkpointing(100)
    self.env.set_restart_strategy(RestartStrategies.no_restart())
    self.t_env = StreamTableEnvironment.create(self.env)
    self.temp_dir = tempfile.mkdtemp()

def setUp(self) -> None:
    from pyflink.datastream import StreamExecutionEnvironment

    super(DataStreamConversionTestCases, self).setUp()
    config = Configuration()
    config.set_string("akka.ask.timeout", "20 s")
    self.env = StreamExecutionEnvironment.get_execution_environment(config)
    self.t_env = StreamTableEnvironment.create(self.env)

    self.env.set_parallelism(2)
    self.t_env.get_config().set("python.fn-execution.bundle.size", "1")
    self.test_sink = DataStreamTestSinkFunction()

def pandas_udaf():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP_LTZ(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts, name, price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .column('w_start', DataTypes.TIMESTAMP_LTZ())
                               .column('w_end', DataTypes.TIMESTAMP_LTZ())
                               .build())
                       .build())

    @udaf(result_type=DataTypes.FLOAT(), func_type="pandas")
    def mean_udaf(v):
        return v.mean()

    # define the tumble window operation
    table = table.window(Tumble.over(lit(5).seconds).on(col("ts")).alias("w")) \
                 .group_by(table.name, col('w')) \
                 .select(table.name, mean_udaf(table.price), col("w").start, col("w").end)

    # submit for execution
    table.execute_insert('sink') \
         .wait()

def create_table_env(self):
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    stream_env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(
        stream_env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())
    statement_set = t_env.create_statement_set()
    t_env.get_config().set_python_executable('/usr/bin/python3')
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)
    return stream_env, t_env, statement_set

def hello_world():
    """
    Read data from a random source, then write it straight out with the print sink.
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode(
    ).use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                          environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean(
        "python.fn-execution.memory.managed", True)

    source_ddl = """
            CREATE TABLE random_source (
                f_sequence INT,
                f_random INT,
                f_random_str STRING
            ) WITH (
                'connector' = 'datagen',
                'rows-per-second'='5',
                'fields.f_sequence.kind'='sequence',
                'fields.f_sequence.start'='1',
                'fields.f_sequence.end'='1000',
                'fields.f_random.min'='1',
                'fields.f_random.max'='1000',
                'fields.f_random_str.length'='10'
            )
            """

    sink_ddl = """
            CREATE TABLE print_sink (
                f_sequence INT,
                f_random INT,
                f_random_str STRING
            ) WITH (
                'connector' = 'print'
            )
            """

    # register the source and sink
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # register the UDF
    t_env.register_function('pass_by', pass_by)

    # read from the source
    tab = t_env.from_path("random_source")

    # For now we stick with the API marked as deprecated, because the new
    # asynchronous submission still needs better test coverage...
    tab.select("f_sequence, f_random, pass_by(f_random_str) ").insert_into(
        "print_sink")

    # run the job
    t_env.execute("Flink Hello World")

def main():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars("file:///app/src/kafka-clients-2.8.0.jar")
    env.add_jars("file:///app/src/flink-connector-kafka_2.12-1.12.3.jar")
    env.set_parallelism(1)
    env.set_stream_time_characteristic(TimeCharacteristic.EventTime)
    env.enable_checkpointing(60000, CheckpointingMode.EXACTLY_ONCE)
    config = env.get_checkpoint_config()
    config.enable_externalized_checkpoints(
        ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION)

    st_env = StreamTableEnvironment.create(
        env,
        environment_settings=EnvironmentSettings.new_instance(
        ).in_streaming_mode().use_blink_planner().build())

    print("register kafka source")
    register_kafka_source(st_env)
    print("register transaction sinks")
    register_transactions_sink_into_csv(st_env)

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w")) \
        .select("""count(message) as total,
                   w.end as end_time
                """) \
        .insert_into("total_sink")

    st_env.from_path("source_tbl") \
        .where("message = 'dolorem'") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w")) \
        .select("""
                   count(message) as total,
                   w.end as end_time
                """) \
        .insert_into("grep_sink")

    st_env.from_path("source_tbl") \
        .window(Slide.over(lit(1).minute).every(lit(5).seconds).on("ts").alias("w")) \
        .group_by(col("w"), col("message")) \
        .select("""
                   count(message) as total,
                   message,
                   w.end as end_time
                """) \
        .insert_into("topk_sink")

    st_env.execute("app")

def test_add_python_file(self):
    import uuid
    env = self.env
    python_file_dir = os.path.join(self.tempdir,
                                   "python_file_dir_" + str(uuid.uuid4()))
    os.mkdir(python_file_dir)
    python_file_path = os.path.join(python_file_dir, "test_dep1.py")
    with open(python_file_path, 'w') as f:
        f.write("def add_two(a):\n    return a + 2")

    def plus_two_map(value):
        from test_dep1 import add_two
        return add_two(value)

    get_j_env_configuration(env._j_stream_execution_environment).\
        setString("taskmanager.numberOfTaskSlots", "10")
    env.add_python_file(python_file_path)
    ds = env.from_collection([1, 2, 3, 4, 5])
    ds = ds.map(plus_two_map, Types.LONG()) \
           .slot_sharing_group("data_stream") \
           .map(lambda i: i, Types.LONG()) \
           .slot_sharing_group("table")

    python_file_path = os.path.join(python_file_dir, "test_dep2.py")
    with open(python_file_path, 'w') as f:
        f.write("def add_three(a):\n    return a + 3")

    def plus_three(value):
        from test_dep2 import add_three
        return add_three(value)

    t_env = StreamTableEnvironment.create(
        stream_execution_environment=env,
        environment_settings=EnvironmentSettings.in_streaming_mode())
    env.add_python_file(python_file_path)

    from pyflink.table.udf import udf
    from pyflink.table.expressions import col
    add_three = udf(plus_three, result_type=DataTypes.BIGINT())

    tab = t_env.from_data_stream(ds, col('a')) \
               .select(add_three(col('a')))
    t_env.to_append_stream(tab, Types.ROW([Types.LONG()])) \
         .map(lambda i: i[0]) \
         .add_sink(self.test_sink)

    env.execute("test add_python_file")
    result = self.test_sink.get_results(True)
    expected = ['6', '7', '8', '9', '10']
    result.sort()
    expected.sort()
    self.assertEqual(expected, result)

def tumble_window_demo():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_parallelism(1)
    t_env = StreamTableEnvironment.create(stream_execution_environment=env)

    # define the source with watermark definition
    ds = env.from_collection(
        collection=[
            (Instant.of_epoch_milli(1000), 'Alice', 110.1),
            (Instant.of_epoch_milli(4000), 'Bob', 30.2),
            (Instant.of_epoch_milli(3000), 'Alice', 20.0),
            (Instant.of_epoch_milli(2000), 'Bob', 53.1),
            (Instant.of_epoch_milli(5000), 'Alice', 13.1),
            (Instant.of_epoch_milli(3000), 'Bob', 3.1),
            (Instant.of_epoch_milli(7000), 'Bob', 16.1),
            (Instant.of_epoch_milli(10000), 'Alice', 20.1)
        ],
        type_info=Types.ROW([Types.INSTANT(), Types.STRING(), Types.FLOAT()]))

    table = t_env.from_data_stream(
        ds,
        Schema.new_builder()
              .column_by_expression("ts", "CAST(f0 AS TIMESTAMP(3))")
              .column("f1", DataTypes.STRING())
              .column("f2", DataTypes.FLOAT())
              .watermark("ts", "ts - INTERVAL '3' SECOND")
              .build()
    ).alias("ts", "name", "price")

    # define the sink
    t_env.create_temporary_table(
        'sink',
        TableDescriptor.for_connector('print')
                       .schema(Schema.new_builder()
                               .column('name', DataTypes.STRING())
                               .column('total_price', DataTypes.FLOAT())
                               .build())
                       .build())

    # define the over window operation
    table = table.over_window(
        Over.partition_by(col("name"))
            .order_by(col("ts"))
            .preceding(row_interval(2))
            .following(CURRENT_ROW)
            .alias('w')) \
        .select(table.name, table.price.max.over(col('w')))

    # submit for execution
    table.execute_insert('sink') \
         .wait()