def main(): options = parse_args() python_engine = PythonEngine("s3_file_source") s3_source = S3FileSource(python_engine) s3_source.configure(vars(options)) output_list = s3_source.materialize([]) print("File downloaded to: {}".format(output_list[0]))
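
# parse_args() is not defined in this snippet. A minimal sketch of what it could
# look like, assuming hypothetical option names -- the real S3FileSource component
# defines its own parameter keys, and configure() simply receives vars(options)
# as a plain dict:
def parse_args():
    import argparse

    parser = argparse.ArgumentParser(description="Download a file from S3")
    # Illustrative flags only; each one becomes an entry in the dict passed
    # to s3_source.configure().
    parser.add_argument("--bucket", required=True, help="S3 bucket name")
    parser.add_argument("--key", required=True, help="Object key to download")
    return parser.parse_args()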
def _init_ml_engine(self, pipeline):
    engine_type = pipeline[json_fields.PIPELINE_ENGINE_TYPE_FIELD]
    self._logger.info("Engine type: {}".format(engine_type))

    if engine_type == EngineType.PY_SPARK:
        from parallelm.ml_engine.py_spark_engine import PySparkEngine

        self._ml_engine = PySparkEngine(pipeline[json_fields.PIPELINE_NAME_FIELD],
                                        self._run_locally,
                                        self._spark_jars)
        self.set_logger(self._ml_engine.get_engine_logger(self.logger_name()))
        if mlops_loaded:
            mlops.init(self._ml_engine.context)
    elif engine_type == EngineType.GENERIC:
        from parallelm.ml_engine.python_engine import PythonEngine

        self._logger.info("Using python engine")
        self._ml_engine = PythonEngine(pipeline[json_fields.PIPELINE_NAME_FIELD],
                                       self._mlcomp_jar)
        self.set_logger(self._ml_engine.get_engine_logger(self.logger_name()))
        if mlops_loaded:
            # This initialization applies only to Python components, not to components
            # written in other languages (e.g., R). Those components are executed in a
            # separate process and therefore need to load and initialize the mlops
            # library on their own.
            mlops.init()
    elif engine_type == EngineType.REST_MODEL_SERVING:
        from parallelm.ml_engine.rest_model_serving_engine import RestModelServingEngine

        self._logger.info("Using REST Model Serving engine")
        self._ml_engine = RestModelServingEngine(pipeline[json_fields.PIPELINE_NAME_FIELD],
                                                 self._mlcomp_jar,
                                                 self._standalone)
        self.set_logger(self._ml_engine.get_engine_logger(self.logger_name()))
        if mlops_loaded:
            # Same note as in the generic branch: only Python components are covered
            # here; components in other languages initialize mlops in their own process.
            mlops.init()
    else:
        raise MLCompException(
            "Engine type is not supported by the Python execution engine! engineType: " + engine_type)

    if mlops_loaded:
        self._ml_engine.run(mlops, pipeline)
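
# For illustration: a pipeline description that would send _init_ml_engine() above
# down the generic-Python branch. The "engineType" and "name" keys mirror the
# pipeline dicts used in the tests below; the commented call is a sketch, not the
# executor's public API.
pipeline = {
    "name": "my-pipeline",       # read via json_fields.PIPELINE_NAME_FIELD
    "engineType": "Generic",     # read via json_fields.PIPELINE_ENGINE_TYPE_FIELD
    "pipe": []
}
# executor._init_ml_engine(pipeline)   # would log "Using python engine"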
def main():
    options = parse_args()

    params = vars(options)
    input_file = params.pop("input_file")

    python_engine = PythonEngine("s3_file_sink")
    s3_sink = S3FileSink(python_engine)
    s3_sink.configure(params)

    parent_objects = [input_file]
    s3_sink.materialize(parent_objects)
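
# A minimal sketch (with hypothetical parameter dicts) of how the two entry points
# compose: the source's materialize() output list feeds the sink's materialize() as
# a parent object, just as main() above does with the popped "input_file" value.
engine = PythonEngine("s3_copy")

source = S3FileSource(engine)
source.configure(source_params)        # source_params: dict of the source's S3 options
downloaded = source.materialize([])    # item 0 is the local path of the downloaded file

sink = S3FileSink(engine)
sink.configure(sink_params)            # sink_params: dict of the sink's S3 options
sink.materialize([downloaded[0]])      # upload the file the source just produced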
def test_python_stand_alone_argument_building(self):
    systemConfig = {
        "statsDBHost": "localhost",
        "statsDBPort": 8899,
        "statsMeasurementID": "tf-job-0001",
        "mlObjectSocketHost": "localhost",
        "mlObjectSocketSourcePort": 9900,
        "mlObjectSocketSinkPort": 9901,
        "modelFileSinkPath": "output-model-1234",
        "modelFileSourcePath": "input-model-1234",
        "healthStatFilePath": "/tmp/health",
        "workflowInstanceId": "/tmp/run/filesink1",
        "socketSourcePort": 0,
        "socketSinkPort": 0,
        "enableHealth": True,
        "canaryThreshold": 0.0
    }

    pipeline = {
        "name": "stand_alone_test",
        "engineType": "Generic",
        "pipe": [
            {
                "name": "Test Train",
                "id": 1,
                "type": "test-python-train",
                "parents": [],
                "arguments": {
                    "arg1": "arg1-value"
                }
            }
        ]
    }

    python_engine = PythonEngine("test-pipe")
    comps_desc_list = ComponentsDesc(python_engine, pipeline=pipeline).load()
    dag = Dag(pipeline, comps_desc_list, python_engine)
    dag_node = dag.get_dag_node(0)

    input_args = dag_node.input_arguments(systemConfig, comp_only_args=True)
    assert input_args["arg1"] == "arg1-value"
    assert input_args["output-model"] == "output-model-1234"
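
# The second assertion encodes an inference worth spelling out: "output-model" is
# not among the component's own arguments, so the DAG fills it in from the system
# config's modelFileSinkPath ("output-model-1234"). This reading is derived from
# the test itself, not from library documentation.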
def test_dag_detect_is_stand_alone(self):
    pipeline = {
        "name": "stand_alone_test",
        "engineType": "Generic",
        "pipe": [
            {
                "name": "Hello",
                "id": 1,
                "type": "hello-world",
                "parents": [],
                "arguments": {
                    "arg1": "arg1-value"
                }
            }
        ]
    }

    python_engine = PythonEngine("test-pipe")
    comps_desc_list = ComponentsDesc(python_engine, pipeline=pipeline).load()
    dag = Dag(pipeline, comps_desc_list, python_engine)

    assert dag.is_stand_alone is True
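
# By contrast, a sketch of the negative case (assumed, not taken from the test
# suite): a pipeline whose second component lists the first as a parent should
# not be detected as stand-alone.
two_node_pipeline = {
    "name": "two_node_test",
    "engineType": "Generic",
    "pipe": [
        {"name": "Hello", "id": 1, "type": "hello-world",
         "parents": [], "arguments": {"arg1": "arg1-value"}},
        {"name": "World", "id": 2, "type": "hello-world",
         "parents": [{"parent": 1, "output": 0, "input": 0}],
         "arguments": {"arg1": "arg1-value"}}
    ]
}
# dag = Dag(two_node_pipeline,
#           ComponentsDesc(python_engine, pipeline=two_node_pipeline).load(),
#           python_engine)
# assert dag.is_stand_alone is False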
def test_correct_python_component_io(self):
    pipeline = {
        "name": "stand_alone_test",
        "engineType": "Generic",
        "pipe": [
            {
                "name": "Test Train 1",
                "id": 1,
                "type": "test-python-train",
                "parents": [],
                "arguments": {
                    "arg1": "arg1-value"
                }
            },
            {
                "name": "Test Train 2",
                "id": 2,
                "type": "test-python-train",
                "parents": [
                    {"parent": 1, "output": 1, "input": 1},
                    {"parent": 1, "output": 0, "input": 0}
                ],
                "arguments": {
                    "arg1": "arg1-value"
                }
            },
            {
                "name": "Test Train 3",
                "id": 3,
                "type": "test-python-train",
                "parents": [
                    {"parent": 2, "output": 0, "input": 0},
                    {"parent": 2, "output": 2, "input": 2},
                    {"parent": 2, "output": 1, "input": 1}
                ],
                "arguments": {
                    "arg1": "arg1-value"
                }
            },
            {
                "name": "Test Train 4",
                "id": 4,
                "type": "test-python-train",
                "parents": [
                    {"parent": 3, "output": 0, "input": 1},
                    {"parent": 3, "output": 1, "input": 0}
                ],
                "arguments": {
                    "arg1": "arg1-value"
                }
            }
        ]
    }

    python_engine = PythonEngine("test-pipe")
    comps_desc_list = ComponentsDesc(python_engine, pipeline=pipeline).load()
    dag = Dag(pipeline, comps_desc_list, python_engine)

    dag_node_1 = dag.get_dag_node(0)
    dag_node_2 = dag.get_dag_node(1)
    dag_node_3 = dag.get_dag_node(2)
    dag_node_4 = dag.get_dag_node(3)

    # Label scheme: "A100" means object A, produced by node id 1, output index 0,
    # routed to input index 0 of the child node.
    #
    # Pipeline wiring:
    #   node 1 -> node 2: output 0 -> input 0, output 1 -> input 1 (straight)
    #   node 2 -> node 3: output 0 -> input 0, output 1 -> input 1,
    #                     output 2 -> input 2 (straight)
    #   node 3 -> node 4: output 0 -> input 1, output 1 -> input 0 (crossed)
    dag.update_parent_data_objs(dag_node_1, ["A100", "B111"])
    dag.update_parent_data_objs(dag_node_2, ["A200", "B211", "C222"])
    dag.update_parent_data_objs(dag_node_3, ["A301", "B310"])

    # Node 1 has no parents, so its input objects should be empty.
    assert dag.parent_data_objs(dag_node_1) == []

    # Node 2's parent edges are not listed in input-index order in the json,
    # yet its data objects should still come back ordered by input index.
    assert dag.parent_data_objs(dag_node_2) == ["A100", "B111"]

    # Node 3's inputs are a bit more involved, but the same rule applies.
    assert dag.parent_data_objs(dag_node_3) == ["A200", "B211", "C222"]

    # Node 4 receives node 3's output index 0 on its input index 1 and node 3's
    # output index 1 on its input index 0 (the crossed edges).
    assert dag.parent_data_objs(dag_node_4) == ["B310", "A301"]
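
# The invariant the assertions above check can be stated directly: regardless of
# the order in which parent edges appear in the json, parent_data_objs() returns
# the objects sorted by the child's input index. A standalone sketch of that rule
# (hypothetical names, assuming a single parent for brevity -- not the Dag API):
def order_by_input_index(parents, objs_by_output):
    # parents: list of {"parent", "output", "input"} edges, in any order
    # objs_by_output: dict mapping a parent's output index to its data object
    return [objs_by_output[edge["output"]]
            for edge in sorted(parents, key=lambda edge: edge["input"])]

# Example mirroring node 4 above: output 0 -> input 1 and output 1 -> input 0.
assert order_by_input_index(
    [{"parent": 3, "output": 0, "input": 1}, {"parent": 3, "output": 1, "input": 0}],
    {0: "A301", 1: "B310"}) == ["B310", "A301"]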
def _init_ml_engine(self, pipeline):
    engine_type = pipeline[json_fields.PIPELINE_ENGINE_TYPE_FIELD]
    self._logger.info("Engine type: {}".format(engine_type))

    if engine_type == EngineType.PY_SPARK:
        from parallelm.ml_engine.py_spark_engine import PySparkEngine

        self._ml_engine = PySparkEngine(pipeline, self._run_locally, self._spark_jars)
        if mlops_loaded:
            mlops.init(self._ml_engine.context)
            mlops.set_uuid(self._uuid)
    elif engine_type in [EngineType.GENERIC, EngineType.REST_MODEL_SERVING, EngineType.SAGEMAKER]:
        # All of these engines are supposed to derive from the python engine.
        if engine_type == EngineType.GENERIC:
            from parallelm.ml_engine.python_engine import PythonEngine

            self._logger.info("Using python engine")
            self._ml_engine = PythonEngine(pipeline, self._mlcomp_jar)
            self.set_logger(self._ml_engine.get_engine_logger(self.logger_name()))
            if mlops_loaded:
                # This initialization applies only to Python components, not to components
                # written in other languages (e.g., R). Those components are executed in a
                # separate process and therefore need to load and initialize the mlops
                # library on their own.
                mlops.init()
                mlops.set_uuid(self._uuid)
        elif engine_type == EngineType.REST_MODEL_SERVING:
            from parallelm.ml_engine.rest_model_serving_engine import RestModelServingEngine

            self._logger.info("Using REST Model Serving engine")
            self._ml_engine = RestModelServingEngine(pipeline, self._mlcomp_jar, self._standalone)
        elif engine_type == EngineType.SAGEMAKER:
            from parallelm.ml_engine.sagemaker_engine import SageMakerEngine

            self._logger.info("Using SageMaker engine")
            self._ml_engine = SageMakerEngine(pipeline)

            if mlops_loaded:
                # Same note as in the generic branch: only Python components are covered
                # here; components in other languages initialize mlops in their own process.
                from os import environ
                from parallelm.components.restful.constants import RestfulConstants

                if environ.get(RestfulConstants.STATS_AGGREGATE_FLAG) is not None:
                    self._logger.info("Using the accumulator channel")
                    mlops.init(mlops_mode=MLOpsMode.REST_ACCUMULATOR)
                else:
                    self._logger.info("Using the standard channel")
                    mlops.init()
                mlops.set_uuid(self._uuid)
    else:
        raise MLCompException(
            "Engine type is not supported by the Python execution engine! engineType: {}".format(engine_type))

    if mlops_loaded:
        self._ml_engine.set_uuid(self._uuid)
        self._ml_engine.run(mlops, pipeline)
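
# The accumulator-channel switch above is presence-tested via os.environ, so
# (assuming RestfulConstants.STATS_AGGREGATE_FLAG holds the variable's name) it can
# be enabled by exporting that variable before the pipeline starts. A sketch:
import os
from parallelm.components.restful.constants import RestfulConstants

os.environ[RestfulConstants.STATS_AGGREGATE_FLAG] = "1"  # any value; only presence is checked
# executor._init_ml_engine(pipeline)  # would log "Using the accumulator channel"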