Example no. 1
def main():
    options = parse_args()

    # Wire the S3 source component to a Python engine and configure it
    # from the parsed command-line options.
    python_engine = PythonEngine("s3_file_source")
    s3_source = S3FileSource(python_engine)
    s3_source.configure(vars(options))

    # The source has no parent components, so materialize with an empty list.
    output_list = s3_source.materialize([])

    print("File downloaded to: {}".format(output_list[0]))
Example no. 2
    def _init_ml_engine(self, pipeline):
        engine_type = pipeline[json_fields.PIPELINE_ENGINE_TYPE_FIELD]
        self._logger.info("Engine type: {}".format(engine_type))
        if engine_type == EngineType.PY_SPARK:
            from parallelm.ml_engine.py_spark_engine import PySparkEngine

            self._ml_engine = PySparkEngine(
                pipeline[json_fields.PIPELINE_NAME_FIELD], self._run_locally,
                self._spark_jars)
            self.set_logger(
                self._ml_engine.get_engine_logger(self.logger_name()))
            if mlops_loaded:
                mlops.init(self._ml_engine.context)

        elif engine_type == EngineType.GENERIC:
            from parallelm.ml_engine.python_engine import PythonEngine

            self._logger.info("Using python engine")
            self._ml_engine = PythonEngine(
                pipeline[json_fields.PIPELINE_NAME_FIELD], self._mlcomp_jar)
            self.set_logger(
                self._ml_engine.get_engine_logger(self.logger_name()))
            if mlops_loaded:
                # This initialization applies only to Python components, not to
                # components written in other languages (e.g. R). Those components
                # are executed in a different process and therefore have to load
                # and initialize the mlops library on their own.
                mlops.init()

        elif engine_type == EngineType.REST_MODEL_SERVING:
            from parallelm.ml_engine.rest_model_serving_engine import RestModelServingEngine

            self._logger.info("Using REST Model Serving engine")
            self._ml_engine = RestModelServingEngine(
                pipeline[json_fields.PIPELINE_NAME_FIELD], self._mlcomp_jar,
                self._standalone)
            self.set_logger(
                self._ml_engine.get_engine_logger(self.logger_name()))
            if mlops_loaded:
                # This initialization applies only to Python components, not to
                # components written in other languages (e.g. R). Those components
                # are executed in a different process and therefore have to load
                # and initialize the mlops library on their own.
                mlops.init()

        else:
            raise MLCompException(
                "Engine type is not supported by the Python execution engine! engineType: "
                + engine_type)

        if mlops_loaded:
            self._ml_engine.run(mlops, pipeline)
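The mlops_loaded flag guarding the mlops.init() calls is presumably set once at import time. A minimal sketch of such a guard, with the import path assumed (it is not shown in this excerpt):

try:
    from parallelm import mlops  # assumed import path
    mlops_loaded = True
except ImportError:
    mlops_loaded = False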
Example no. 3
def main():
    options = parse_args()

    # Split the input file off from the component's own configuration.
    params = vars(options)
    input_file = params.pop("input_file")

    python_engine = PythonEngine("s3_file_sink")
    s3_sink = S3FileSink(python_engine)
    s3_sink.configure(params)

    # The file to upload is handed to the sink as its single parent object.
    s3_sink.materialize([input_file])
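Together with Example no. 1, the two excerpts suggest a shared configure/materialize contract that lets the components be chained, with the source's materialized output feeding the sink's parent objects. A sketch under the assumptions that a single PythonEngine instance can be shared and that source_params and sink_params are pre-built option dicts (both names are hypothetical):

python_engine = PythonEngine("s3_copy")

# Download: the source has no parents, so it materializes from an empty list.
source = S3FileSource(python_engine)
source.configure(source_params)
downloaded = source.materialize([])

# Upload: the downloaded file list becomes the sink's parent objects.
sink = S3FileSink(python_engine)
sink.configure(sink_params)
sink.materialize(downloaded)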
Example no. 4
    def test_python_stand_alone_argument_building(self):
        systemConfig = {
            "statsDBHost": "localhost",
            "statsDBPort": 8899,
            "statsMeasurementID": "tf-job-0001",
            "mlObjectSocketHost": "localhost",
            "mlObjectSocketSourcePort": 9900,
            "mlObjectSocketSinkPort": 9901,
            "modelFileSinkPath": "output-model-1234",
            "modelFileSourcePath": "input-model-1234",
            "healthStatFilePath": "/tmp/health",
            "workflowInstanceId": "/tmp/run/filesink1",
            "socketSourcePort": 0,
            "socketSinkPort": 0,
            "enableHealth": True,
            "canaryThreshold": 0.0
        }
        pipeline = {
            "name": "stand_alone_test",
            "engineType": "Generic",
            "pipe": [{
                "name": "Test Train",
                "id": 1,
                "type": "test-python-train",
                "parents": [],
                "arguments": {"arg1": "arg1-value"}
            }]
        }
        python_engine = PythonEngine("test-pipe")
        comps_desc_list = ComponentsDesc(python_engine,
                                         pipeline=pipeline).load()
        dag = Dag(pipeline, comps_desc_list, python_engine)

        dag_node = dag.get_dag_node(0)
        input_args = dag_node.input_arguments(systemConfig,
                                              comp_only_args=True)
        assert input_args["arg1"] == "arg1-value"
        assert input_args["output-model"] == "output-model-1234"
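The second assert is the more telling one: "output-model" appears nowhere in the pipeline definition, so input_arguments evidently merges selected systemConfig entries (here modelFileSinkPath, whose value is "output-model-1234") into the component's arguments under component-facing names, even when comp_only_args is True.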
Example no. 5
    def test_dag_detect_is_stand_alone(self):

        pipeline = {
            "name": "stand_alone_test",
            "engineType": "Generic",
            "pipe": [{
                "name": "Hello",
                "id": 1,
                "type": "hello-world",
                "parents": [],
                "arguments": {"arg1": "arg1-value"}
            }]
        }
        python_engine = PythonEngine("test-pipe")
        comps_desc_list = ComponentsDesc(python_engine,
                                         pipeline=pipeline).load()
        dag = Dag(pipeline, comps_desc_list, python_engine)
        assert dag.is_stand_alone is True
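Presumably the stand-alone flag is driven by the pipeline containing a single component; under that assumption, the negative case might look like the following sketch (the test name and pipeline here are hypothetical, not taken from the repository):

    def test_dag_detect_is_not_stand_alone(self):
        # Two chained components: assumed to be enough to make the DAG
        # non-stand-alone.
        pipeline = {
            "name": "chained_test",
            "engineType": "Generic",
            "pipe": [{
                "name": "Hello",
                "id": 1,
                "type": "hello-world",
                "parents": [],
                "arguments": {}
            }, {
                "name": "World",
                "id": 2,
                "type": "hello-world",
                "parents": [{"parent": 1, "output": 0, "input": 0}],
                "arguments": {}
            }]
        }
        python_engine = PythonEngine("test-pipe")
        comps_desc_list = ComponentsDesc(python_engine, pipeline=pipeline).load()
        dag = Dag(pipeline, comps_desc_list, python_engine)
        assert dag.is_stand_alone is False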
Example no. 6
    def test_correct_python_component_io(self):
        pipeline = {
            "name": "stand_alone_test",
            "engineType": "Generic",
            "pipe": [{
                "name": "Test Train 1",
                "id": 1,
                "type": "test-python-train",
                "parents": [],
                "arguments": {"arg1": "arg1-value"}
            }, {
                "name": "Test Train 2",
                "id": 2,
                "type": "test-python-train",
                "parents": [
                    {"parent": 1, "output": 1, "input": 1},
                    {"parent": 1, "output": 0, "input": 0}
                ],
                "arguments": {"arg1": "arg1-value"}
            }, {
                "name": "Test Train 3",
                "id": 3,
                "type": "test-python-train",
                "parents": [
                    {"parent": 2, "output": 0, "input": 0},
                    {"parent": 2, "output": 2, "input": 2},
                    {"parent": 2, "output": 1, "input": 1}
                ],
                "arguments": {"arg1": "arg1-value"}
            }, {
                "name": "Test Train 4",
                "id": 4,
                "type": "test-python-train",
                "parents": [
                    {"parent": 3, "output": 0, "input": 1},
                    {"parent": 3, "output": 1, "input": 0}
                ],
                "arguments": {"arg1": "arg1-value"}
            }]
        }
        python_engine = PythonEngine("test-pipe")
        comps_desc_list = ComponentsDesc(python_engine,
                                         pipeline=pipeline).load()
        dag = Dag(pipeline, comps_desc_list, python_engine)

        dag_node_1 = dag.get_dag_node(0)
        dag_node_2 = dag.get_dag_node(1)
        dag_node_3 = dag.get_dag_node(2)
        dag_node_4 = dag.get_dag_node(3)

        # "A100" means: type A, node id 1, output index 0, goes to input index 0.
        # The pipeline is as follows:

        #     OUTPUT INDEX 0 - INPUT INDEX 0      OUTPUT INDEX 0 - INPUT INDEX 0      OUTPUT INDEX 0   INPUT INDEX 0
        #    /                              \    /                              \    /              \ /             \
        # ID 1                               ID 2-OUTPUT INDEX 1 - INPUT INDEX 1-ID 3                /\              ID 4
        #    \                              /    \                              /    \              /  \            /
        #     OUTPUT INDEX 1 - INPUT INDEX 1      OUTPUT INDEX 2 - INPUT INDEX 2      OUTPUT INDEX 1    INPUT INDEX 1

        dag.update_parent_data_objs(dag_node_1, ["A100", "B111"])
        dag.update_parent_data_objs(dag_node_2, ["A200", "B211", "C222"])
        dag.update_parent_data_objs(dag_node_3, ["A301", "B310"])

        # Node 1 has no parents, so its parent object list should be empty.
        assert dag.parent_data_objs(dag_node_1) == []
        # Node 2's parent entries are not listed in index order in the JSON,
        # but the outputs should still land in the correct input slots.
        assert dag.parent_data_objs(dag_node_2) == ["A100", "B111"]
        # Node 3's inputs are a little more involved, but it is the same story.
        assert dag.parent_data_objs(dag_node_3) == ["A200", "B211", "C222"]
        # Node 4 receives node 3's output index 0 at its input index 1 and
        # node 3's output index 1 at its input index 0.
        assert dag.parent_data_objs(dag_node_4) == ["B310", "A301"]
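What these asserts pin down is the routing rule: each parent entry sends the parent's output at index "output" into this node's input slot "input", independent of the order of the entries in the JSON. A self-contained sketch of that rule (an illustration, not the actual Dag implementation):

def route_parent_outputs(parent_entries, parent_outputs):
    """Place each parent output into the input slot its entry names."""
    slots = [None] * len(parent_entries)
    for entry in parent_entries:
        slots[entry["input"]] = parent_outputs[entry["output"]]
    return slots

# Node 4 above: node 3's outputs ["A301", "B310"] arrive swapped.
entries = [{"parent": 3, "output": 0, "input": 1},
           {"parent": 3, "output": 1, "input": 0}]
assert route_parent_outputs(entries, ["A301", "B310"]) == ["B310", "A301"]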
Example no. 7
    def _init_ml_engine(self, pipeline):
        engine_type = pipeline[json_fields.PIPELINE_ENGINE_TYPE_FIELD]
        self._logger.info("Engine type: {}".format(engine_type))
        if engine_type == EngineType.PY_SPARK:
            from parallelm.ml_engine.py_spark_engine import PySparkEngine

            self._ml_engine = PySparkEngine(pipeline, self._run_locally,
                                            self._spark_jars)
            if mlops_loaded:
                mlops.init(self._ml_engine.context)
                mlops.set_uuid(self._uuid)

        elif engine_type in [
                EngineType.GENERIC, EngineType.REST_MODEL_SERVING,
                EngineType.SAGEMAKER
        ]:
            # All of these engine types are derived from the Python engine.

            if engine_type == EngineType.GENERIC:
                from parallelm.ml_engine.python_engine import PythonEngine

                self._logger.info("Using python engine")
                self._ml_engine = PythonEngine(pipeline, self._mlcomp_jar)

                self.set_logger(
                    self._ml_engine.get_engine_logger(self.logger_name()))

            elif engine_type == EngineType.REST_MODEL_SERVING:
                from parallelm.ml_engine.rest_model_serving_engine import RestModelServingEngine

                self._logger.info("Using REST Model Serving engine")
                self._ml_engine = RestModelServingEngine(
                    pipeline, self._mlcomp_jar, self._standalone)

            elif engine_type == EngineType.SAGEMAKER:
                from parallelm.ml_engine.sagemaker_engine import SageMakerEngine

                self._logger.info("Using SageMaker engine")
                self._ml_engine = SageMakerEngine(pipeline)

            if mlops_loaded:
                # This initialization applies only to Python components, not to
                # components written in other languages (e.g. R). Those components
                # are executed in a different process and therefore have to load
                # and initialize the mlops library on their own.

                from os import environ
                from parallelm.components.restful.constants import RestfulConstants
                if environ.get(
                        RestfulConstants.STATS_AGGREGATE_FLAG) is not None:
                    self._logger.info("Using the accumulator channel")
                    mlops.init(mlops_mode=MLOpsMode.REST_ACCUMULATOR)
                else:
                    self._logger.info("Using the standard channel")
                    mlops.init()
                mlops.set_uuid(self._uuid)

        else:
            raise MLCompException(
                "Engine type is not supported by the Python execution engine! engineType: {}"
                .format(engine_type))

        if mlops_loaded:
            self._ml_engine.set_uuid(self._uuid)
            self._ml_engine.run(mlops, pipeline)
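Because the shared block above only checks that the RestfulConstants.STATS_AGGREGATE_FLAG variable is present in the environment, the accumulator channel can be selected by exporting it with any value before the engine is initialized. A sketch that uses the constant itself, so the underlying variable name does not have to be known:

from os import environ
from parallelm.components.restful.constants import RestfulConstants

# Any value works: _init_ml_engine only tests for the variable's presence.
environ[RestfulConstants.STATS_AGGREGATE_FLAG] = "1"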