Example #1
0
 def setUp(self):
     """Prepare a parallelism-2 stream environment and an old-planner table environment.

     Stores the stream execution environment on ``self.env`` and a
     streaming-mode ``StreamTableEnvironment`` on ``self.t_env``.
     """
     super(PyFlinkStreamTableTestCase, self).setUp()
     self.env = stream_env = StreamExecutionEnvironment.get_execution_environment()
     stream_env.set_parallelism(2)
     # Build the settings step by step instead of one long fluent chain.
     settings_builder = EnvironmentSettings.new_instance()
     settings = settings_builder.in_streaming_mode().use_old_planner().build()
     self.t_env = StreamTableEnvironment.create(
         stream_env, environment_settings=settings)
Example #2
0
 def setUp(self):
     """Prepare a parallelism-2 stream environment and a blink-planner table environment.

     After creating ``self.t_env`` in streaming mode, the configuration key
     ``taskmanager.memory.task.off-heap.size`` is set to ``"80mb"``.
     """
     super(PyFlinkBlinkStreamTableTestCase, self).setUp()
     self.env = stream_env = StreamExecutionEnvironment.get_execution_environment()
     stream_env.set_parallelism(2)
     settings_builder = EnvironmentSettings.new_instance()
     settings = settings_builder.in_streaming_mode().use_blink_planner().build()
     self.t_env = StreamTableEnvironment.create(
         stream_env, environment_settings=settings)
     configuration = self.t_env.get_config().get_configuration()
     configuration.set_string("taskmanager.memory.task.off-heap.size", "80mb")
Example #3
0
 def setUp(self):
     """Prepare a parallelism-2 stream environment and a blink-planner table environment.

     After creating ``self.t_env`` in streaming mode, the configuration key
     ``python.fn-execution.bundle.size`` is set to ``"1"``.
     """
     super(PyFlinkBlinkStreamTableTestCase, self).setUp()
     self.env = stream_env = StreamExecutionEnvironment.get_execution_environment()
     stream_env.set_parallelism(2)
     settings_builder = EnvironmentSettings.new_instance()
     settings = settings_builder.in_streaming_mode().use_blink_planner().build()
     self.t_env = StreamTableEnvironment.create(
         stream_env, environment_settings=settings)
     configuration = self.t_env.get_config().get_configuration()
     configuration.set_string("python.fn-execution.bundle.size", "1")
    def input_output_table():
        """Run ``map_func`` from ``input_output.py`` between a CSV source and a CSV sink.

        A five-column CSV table source is registered as ``"source"``, passed to
        ``train`` with 2 workers and 1 PS, and the table returned by ``train``
        is inserted into the overwrite-mode CSV sink ``"table_row_sink"``.
        When the executed statement set exposes a job client, the job result
        is awaited.
        """
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        table_env = StreamTableEnvironment.create(stream_env)
        statement_set = table_env.create_statement_set()

        cwd = os.getcwd()
        script_path = cwd + "/../../src/test/python/input_output.py"

        # The same CSV coding class and type list are used for both directions.
        csv_coding = "com.alibaba.flink.ml.operator.coding.RowCSVCoding"
        csv_types = ",".join(("INT_32", "INT_64", "FLOAT_32", "FLOAT_64", "STRING"))
        properties = {
            MLCONSTANTS.ENCODING_CLASS: csv_coding,
            MLCONSTANTS.DECODING_CLASS: csv_coding,
            "sys:csv_encode_types": csv_types,
            "sys:csv_decode_types": csv_types,
            MLCONSTANTS.PYTHON_VERSION: "3.7",
        }

        input_csv = cwd + "/../../src/test/resources/input.csv"
        output_csv = cwd + "/../../src/test/resources/output.csv"
        columns = ("a", "b", "c", "d", "e")

        def column_types():
            # Source, schema and sink each receive their own freshly built list.
            return [
                DataTypes.INT(),
                DataTypes.BIGINT(),
                DataTypes.FLOAT(),
                DataTypes.DOUBLE(),
                DataTypes.STRING(),
            ]

        csv_source = CsvTableSource(input_csv, list(columns), column_types())
        table_env.register_table_source("source", csv_source)
        input_tb = table_env.from_path("source")

        output_schema = TableSchema(list(columns), column_types())

        csv_sink = CsvTableSink(list(columns), column_types(), output_csv,
                                write_mode=WriteMode.OVERWRITE)
        table_env.register_table_sink("table_row_sink", csv_sink)

        tf_config = TFConfig(2, 1, properties, script_path, "map_func", None)
        output_table = train(stream_env, table_env, statement_set,
                             input_tb, tf_config, output_schema)
        # output_table = inference(stream_env, table_env, statement_set,
        #                          input_tb, tf_config, output_schema)

        statement_set.add_insert("table_row_sink", output_table)
        client = statement_set.execute().get_job_client()
        if client is None:
            return
        client.get_job_execution_result(user_class_loader=None).result()
Example #5
0
    def get_stream_table_environment(self) -> StreamTableEnvironment:
        """
        Return the StreamTableEnvironment held by this object.

        If none has been set yet, one is created on top of
        ``StreamExecutionEnvironment.get_execution_environment()`` and kept
        for later calls.

        :return: the StreamTableEnvironment.

        .. versionadded:: 1.11.0
        """
        existing = self._stream_tab_env
        if existing is not None:
            return existing
        created = StreamTableEnvironment.create(
            StreamExecutionEnvironment.get_execution_environment())
        self._stream_tab_env = created
        return created
Example #6
0
    def addTrainStream():
        """Call the stream ``train`` for ``map_func`` in ``add.py`` with 2 workers and 1 PS.

        Every optional argument (properties, env path, Zookeeper settings,
        input stream, output row type) is passed as ``None``.
        """
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        script_path = os.getcwd() + "/../../src/test/python/add.py"
        train(
            2,            # work_num
            1,            # ps_num
            script_path,  # python_file
            "map_func",   # func
            None,         # property
            None,         # env_path
            None,         # zk_conn
            None,         # zk_base_path
            stream_env,
            None,         # input_ds
            None,         # output_row_type
        )
    def addTrainTable():
        """Call the table ``train`` for ``map_func`` in ``add.py`` with 2 workers and 1 PS.

        Every optional argument (properties, env path, Zookeeper settings,
        input table, output schema) is passed as ``None``.
        """
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        table_env = StreamTableEnvironment.create(stream_env)
        script_path = os.getcwd() + "/../../src/test/python/add.py"
        train(
            2,            # work_num
            1,            # ps_num
            script_path,  # python_file
            "map_func",   # func
            None,         # property
            None,         # env_path
            None,         # zk_conn
            None,         # zk_base_path
            stream_env,
            table_env,
            None,         # input_tb
            None,         # output_schema
        )
Example #8
0
    def addTrainChiefAloneStream():
        """Call the stream ``train`` for ``add.py`` with ``TF_IS_CHIEF_ALONE`` set to ``"true"``.

        Uses 2 workers and 1 PS; env path, Zookeeper settings, input stream and
        output row type are all passed as ``None``.
        """
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        script_path = os.getcwd() + "/../../src/test/python/add.py"
        properties = {TFCONSTANS.TF_IS_CHIEF_ALONE: "true"}
        train(
            2,            # work_num
            1,            # ps_num
            script_path,  # python_file
            "map_func",   # func
            properties,
            None,         # env_path
            None,         # zk_conn
            None,         # zk_base_path
            stream_env,
            None,         # input_ds
            None,         # output_row_type
        )
    def fit(self, *inputs: Table) -> 'TensorflowModel':
        """
        Run ``train`` and wrap the outcome in a TensorflowModel.

        :param inputs: optional input tables; only the first one is used. When
                       none are given, ``self.table_env`` supplies the table
                       environment and ``train`` receives ``None`` as its input.
        :return: a TensorflowModel built from ``self.tf_config``, the statement
                 set used for training, ``self.predict_col_names`` and
                 ``self.predict_data_types``.
        :raises RuntimeError: if no inputs are given and ``self.table_env`` is None.
        """
        if inputs:
            input_table = inputs[0]
            t_env = input_table._t_env
        elif self.table_env is not None:
            input_table, t_env = None, self.table_env
        else:
            raise RuntimeError(
                "table_env should not be None if inputs is not given")

        # Reuse the configured statement set when it is truthy, else create one.
        statement_set = self.statement_set or t_env.create_statement_set()
        env = StreamExecutionEnvironment(t_env._j_tenv.execEnv())
        train(env, t_env, statement_set, input_table, self.tf_config)
        return TensorflowModel(
            self.tf_config,
            statement_set,
            self.predict_col_names,
            self.predict_data_types,
        )
Example #10
0
    def worker_zero_finish():
        """Train with ``worker_0_finish.py`` on 3 workers and 2 PS, then execute the job.

        No input table or output schema is supplied; the job is triggered with
        ``table_env.execute("train")``.
        """
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        table_env = StreamTableEnvironment.create(stream_env)
        script_path = os.getcwd() + "/../../src/test/python/worker_0_finish.py"
        properties = {MLCONSTANTS.PYTHON_VERSION: '3.7'}

        tf_config = TFConfig(3, 2, properties, script_path, "map_func", None)
        # Third and fifth arguments are the input table and output schema.
        train(stream_env, table_env, None, tf_config, None)
        # inference(stream_env, table_env, None, tf_config, None)

        table_env.execute("train")
Example #11
0
    def add_train_chief_alone_table():
        """Train with ``add.py`` and ``TF_IS_CHIEF_ALONE`` set to ``"true"``, then execute the job.

        Uses 2 workers and 1 PS with no input table or output schema; the job
        is triggered with ``table_env.execute("train")``.
        """
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        table_env = StreamTableEnvironment.create(stream_env)
        script_path = os.getcwd() + "/../../src/test/python/add.py"
        properties = {
            TFCONSTANS.TF_IS_CHIEF_ALONE: "true",
            MLCONSTANTS.PYTHON_VERSION: "3.7",
        }

        tf_config = TFConfig(2, 1, properties, script_path, "map_func", None)
        # Third and fifth arguments are the input table and output schema.
        train(stream_env, table_env, None, tf_config, None)
        # inference(stream_env, table_env, None, tf_config, None)

        table_env.execute("train")
Example #12
0
    def get_default() -> Optional[MLEnvironment]:
        """
        Return the MLEnvironment registered under the default MLEnvironmentId.

        While holding the factory lock, the map entry for the default id is
        checked; if it is ``None``, an MLEnvironment is built from the Java-side
        ``MLEnvironmentFactory.getDefault()`` and stored in the map.

        :return: the default MLEnvironment.

        .. versionadded:: 1.11.0
        """
        factory = MLEnvironmentFactory
        with factory._lock:
            default_id = factory._default_ml_environment_id
            cached = factory._map[default_id]
            if cached is not None:
                return cached

            jvm = get_gateway().jvm
            j_ml_env = jvm.org.apache.flink.ml.common.MLEnvironmentFactory.getDefault()
            # Wrap each Java environment in its Python counterpart.
            factory._map[default_id] = MLEnvironment(
                ExecutionEnvironment(j_ml_env.getExecutionEnvironment()),
                StreamExecutionEnvironment(j_ml_env.getStreamExecutionEnvironment()),
                BatchTableEnvironment(j_ml_env.getBatchTableEnvironment()),
                StreamTableEnvironment(j_ml_env.getStreamTableEnvironment()))
            return factory._map[default_id]
Example #13
0
    def worker_zero_finish():
        """Train with ``worker_0_finish.py`` on 3 workers and 2 PS via a statement set.

        No input table or output schema is supplied. The statement set is
        executed and, when a job client is available, the job result is awaited.
        """
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        table_env = StreamTableEnvironment.create(stream_env)
        statement_set = table_env.create_statement_set()
        script_path = os.getcwd() + "/../../src/test/python/worker_0_finish.py"
        properties = {MLCONSTANTS.PYTHON_VERSION: '3.7'}

        tf_config = TFConfig(3, 2, properties, script_path, "map_func", None)
        # Fourth and sixth arguments are the input table and output schema.
        train(stream_env, table_env, statement_set, None, tf_config, None)
        # inference(stream_env, table_env, statement_set, None, tf_config, None)

        client = statement_set.execute().get_job_client()
        if client is None:
            return
        client.get_job_execution_result(user_class_loader=None).result()
 def inputOutputTable():
     """Call the table ``train`` for ``input_output.py`` with a CSV-backed input table.

     Registers ``input.csv`` as the five-column table source ``"source"``,
     scans it, and passes it to ``train`` (2 workers, 1 PS) together with an
     output schema using the same columns. Env path and Zookeeper settings are
     passed as ``None``.
     """
     stream_env = StreamExecutionEnvironment.get_execution_environment()
     table_env = StreamTableEnvironment.create(stream_env)
     script_path = os.getcwd() + "/../../src/test/python/input_output.py"

     # The same CSV coding class and type list are used for both directions.
     csv_coding = "com.alibaba.flink.ml.operator.coding.RowCSVCoding"
     csv_types = ",".join(("INT_32", "INT_64", "FLOAT_32", "FLOAT_64", "STRING"))
     properties = {
         MLCONSTANTS.ENCODING_CLASS: csv_coding,
         MLCONSTANTS.DECODING_CLASS: csv_coding,
         "SYS:csv_encode_types": csv_types,
         "SYS:csv_decode_types": csv_types,
     }

     columns = ("a", "b", "c", "d", "e")

     def column_types():
         # NOTE(review): column "b" is INT although its CSV coding type is
         # INT_64 -- confirm whether BIGINT was intended.
         return [
             DataTypes.INT(),
             DataTypes.INT(),
             DataTypes.FLOAT(),
             DataTypes.DOUBLE(),
             DataTypes.STRING(),
         ]

     input_csv = os.getcwd() + "/../../src/test/resources/input.csv"
     csv_source = CsvTableSource(input_csv, list(columns), column_types())
     table_env.register_table_source("source", csv_source)
     input_tb = table_env.scan("source")
     output_schema = TableSchema(list(columns), column_types())

     train(
         2,            # work_num
         1,            # ps_num
         script_path,  # python_file
         "map_func",   # func
         properties,
         None,         # env_path
         None,         # zk_conn
         None,         # zk_base_path
         stream_env,
         table_env,
         input_tb,
         output_schema,
     )
Example #15
0
def inference(num_worker, num_ps=0, python_file=None, func=None, properties=None,
              env_path=None, zk_conn=None, zk_base_path=None,
              stream_env=None, table_env=None, input_table=None, output_schema=None):
    """
    Run Tensorflow inference for a Table and execute the job.

    The call is delegated to the Java ``TFUtils.inference`` and the job is then
    always triggered through ``table_env.execute(job_name="table inference")``
    before this function returns.

    :param num_worker: Number of workers
    :param num_ps: Number of PS
    :param python_file: The python file which is going to be run
    :param func: The user-defined function that runs TF inference. May be None
                 (per the original documentation, inference then runs via the
                 Java API -- not visible in this function).
    :param properties: User-defined properties
    :param env_path: Path to the virtual env
    :param zk_conn: The Zookeeper connection string
    :param zk_base_path: The Zookeeper base path
    :param stream_env: The StreamExecutionEnvironment. If it's None, one is
                       obtained via ``get_execution_environment()``.
    :param table_env: The TableEnvironment. If it's None, one is created on
                      top of the stream environment.
    :param input_table: The input Table, or None
    :param output_schema: The TableSchema of the output Table, or None. None is
                          forwarded to the Java side unchanged (per the original
                          documentation a dummy sink is then added -- not
                          visible in this function).
    :return: The output Table
    """
    tf_config = TFConfig(num_worker, num_ps, python_file, func, properties,
                         env_path, zk_conn, zk_base_path)

    if stream_env is not None:
        exec_env = stream_env
    else:
        exec_env = StreamExecutionEnvironment.get_execution_environment()
    t_env = table_env if table_env is not None else StreamTableEnvironment.create(exec_env)

    # Unwrap the Python wrappers to the underlying Java objects; None stays None.
    j_input_table = None if input_table is None else input_table._j_table
    j_output_schema = None if output_schema is None else output_schema._j_table_schema

    tf_utils = get_gateway().jvm.com.alibaba.flink.ml.tensorflow.client.TFUtils
    j_output_table = tf_utils.inference(
        exec_env._j_stream_execution_environment,
        t_env._j_tenv,
        j_input_table,
        tf_config.java_config(),
        j_output_schema,
    )
    t_env.execute(job_name="table inference")
    return Table(j_output_table)
Example #16
0
    def add_train_chief_alone_table():
        """Train with ``add.py`` and ``TF_IS_CHIEF_ALONE`` set to ``"true"`` via a statement set.

        Uses 2 workers and 1 PS with no input table or output schema. The
        statement set is executed and, when a job client is available, the job
        result is awaited.
        """
        stream_env = StreamExecutionEnvironment.get_execution_environment()
        table_env = StreamTableEnvironment.create(stream_env)
        statement_set = table_env.create_statement_set()
        script_path = os.getcwd() + "/../../src/test/python/add.py"
        properties = {
            TFCONSTANS.TF_IS_CHIEF_ALONE: "true",
            MLCONSTANTS.PYTHON_VERSION: "3.7",
        }

        tf_config = TFConfig(2, 1, properties, script_path, "map_func", None)
        # Fourth and sixth arguments are the input table and output schema.
        train(stream_env, table_env, statement_set, None, tf_config, None)
        # inference(stream_env, table_env, statement_set, None, tf_config, None)

        client = statement_set.execute().get_job_client()
        if client is None:
            return
        client.get_job_execution_result(user_class_loader=None).result()
Example #17
0
 def setUp(self):
     """Store a parallelism-2 stream environment in BATCH runtime mode on ``self.env``."""
     super(PyFlinkBatchTestCase, self).setUp()
     self.env = stream_env = StreamExecutionEnvironment.get_execution_environment()
     stream_env.set_parallelism(2)
     stream_env.set_runtime_mode(RuntimeExecutionMode.BATCH)
Example #18
0
 def setUp(self):
     """Store a parallelism-2 stream environment in STREAMING runtime mode on ``self.env``.

     The environment's private ``_remote_mode`` attribute is also set to True.
     """
     super(PyFlinkStreamingTestCase, self).setUp()
     self.env = stream_env = StreamExecutionEnvironment.get_execution_environment()
     stream_env.set_parallelism(2)
     stream_env.set_runtime_mode(RuntimeExecutionMode.STREAMING)
     stream_env._remote_mode = True