def test_local_py():
    file_path = f"{examples_path}/training.py"
    mod = function_to_module(file_path)
    task = new_task(inputs={"infile.txt": f"{examples_path}/infile.txt"})
    context = get_or_create_ctx("myfunc", spec=task)
    mod.my_job(context, p1=2, p2="x")
    assert context.results["accuracy"] == 4, "failed to run"
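
# A minimal sketch of the kind of handler the test above expects to find in
# examples/training.py (an assumption; the real example may do more): it reads
# the parameters and logs an "accuracy" result, here p1 * 2 so that p1=2
# produces the asserted value of 4.
def my_job(context, p1=1, p2="a-string"):
    context.log_result("accuracy", p1 * 2)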
def dask_pipe(x=1, y=10):
    # the use_db option makes the step reference the function by a DB pointer
    # instead of adding the full function spec to the pipeline YAML
    self.dask_function.as_step(
        new_task(handler="main", name="dask_pipeline", params={"x": x, "y": y}),
        use_db=True,
    )
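
# A hedged, self-contained sketch of how a pipeline function like dask_pipe is
# typically decorated and wired up; the names example_pipe, my-dask, and the
# image are illustrative assumptions, not part of the original test.
import kfp.dsl
import mlrun
from mlrun import new_task

dask_function = mlrun.new_function("my-dask", kind="dask", image="mlrun/ml-base")


@kfp.dsl.pipeline(name="dask_pipeline", description="run a dask step")
def example_pipe(x=1, y=10):
    dask_function.as_step(
        new_task(handler="main", name="dask_pipeline", params={"x": x, "y": y}),
        use_db=True,
    )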
def test_run_training_job(self):
    output_path = str(self.results_path / "{{run.uid}}")

    self._logger.debug("Creating base task")
    base_task = new_task(artifact_path=output_path).set_label("stage", "dev")

    # run our training task with hyper params and select the one with max accuracy
    self._logger.debug("Running task with hyper params")
    train_task = new_task(
        name="my-training", handler="training", params={"p1": 9}, base=base_task
    )
    train_run = self._trainer.run(train_task)

    # run validation, using the model result from the previous step
    self._logger.debug("Running validation using the model from the previous step")
    model = train_run.outputs["mymodel"]
    self._trainer.run(base_task, handler="validation", inputs={"model": model})
def _generate_task(p1, out_path):
    return new_task(
        params={"p1": p1},
        out_path=out_path,
        outputs=["accuracy", "loss"],
    ).set_label("tests", "kfp")
def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service=None):
    name = name or f"{featureset.metadata.name}_ingest"

    use_spark = featureset.spec.engine == "spark"
    if use_spark and not run_config.local and not spark_service:
        raise mlrun.errors.MLRunInvalidArgumentError(
            "Remote spark ingestion requires the spark service name to be provided"
        )

    default_kind = RuntimeKinds.remotespark if use_spark else RuntimeKinds.job
    spark_runtimes = [RuntimeKinds.remotespark]  # may support the spark operator in the future

    if not run_config.function:
        function_ref = featureset.spec.function.copy()
        if function_ref.is_empty():
            function_ref = FunctionReference(name=name, kind=default_kind)
        if not function_ref.url:
            function_ref.code = (function_ref.code or "") + _default_job_handler
        run_config.function = function_ref
        run_config.handler = "handler"

    image = None if use_spark else mlrun.mlconf.feature_store.default_job_image
    function = run_config.to_function(default_kind, image)
    if use_spark and function.kind not in spark_runtimes:
        raise mlrun.errors.MLRunInvalidArgumentError(
            "ingest with the spark engine requires a spark function kind"
        )

    function.metadata.project = featureset.metadata.project
    function.metadata.name = function.metadata.name or name

    if not use_spark and not function.spec.image:
        raise mlrun.errors.MLRunInvalidArgumentError("function image must be specified")

    if use_spark and not run_config.local:
        function.with_spark_service(spark_service=spark_service)

    task = mlrun.new_task(
        name=name,
        params=run_config.parameters,
        handler=run_config.handler,
        out_path=featureset.spec.output_path,
    )
    task.spec.secret_sources = run_config.secret_sources
    task.set_label("job-type", "feature-ingest").set_label(
        "feature-set", featureset.uri
    )

    # set the run UID and save it in the feature set status (linking the feature set to the job)
    task.metadata.uid = uuid.uuid4().hex
    featureset.status.run_uri = task.metadata.uid
    featureset.save()

    run = function.run(
        task, schedule=schedule, local=run_config.local, watch=run_config.watch
    )
    if run_config.watch:
        featureset.reload()
    return run
def custom_setup(self):
    self._logger.debug("Creating basics task")

    # {{run.uid}} will be substituted with the run id, so output will be written
    # to a different directory per run
    output_path = str(self.results_path / "{{run.uid}}")
    self._basics_task = (
        new_task(name="demo", params={"p1": 5}, artifact_path=output_path)
        .with_secrets("file", self.assets_path / "secrets.txt")
        .set_label("type", "demo")
    )

    self._logger.debug("Creating inline task")
    self._inline_task = new_task(
        name="demo2",
        handler=self._get_inline_handler(),
        artifact_path=str(self.results_path / "{{run.uid}}"),
    )
def test_vault_end_to_end():
    # This requires an MLRun API server that is configured to work with Vault.
    # The port below should allow access to that server.
    api_server_port = 57764

    _set_vault_mlrun_configuration(api_server_port)
    project_name = "abc"
    func_name = "vault-function"
    aws_key_value = "1234567890"
    github_key_value = "proj1Key!!!"

    project = new_project(project_name)
    # This call initializes the Vault infrastructure and adds the given secrets.
    # It executes on the API server.
    project.create_vault_secrets(
        {"aws_key": aws_key_value, "github_key": github_key_value}
    )

    # This API executes on the client side
    project_secrets = project.get_vault_secret_keys()
    assert project_secrets == ["aws_key", "github_key"], "secrets not created"

    # Create the function and set its container configuration
    function = code_to_function(
        name=func_name,
        filename="{}/vault_function.py".format(examples_path),
        handler="vault_func",
        project=project_name,
        kind="job",
    )
    function.spec.image = "saarcoiguazio/mlrun:unstable"

    # Create a task for the execution
    spec = new_task(
        project=project_name,
        name="vault_test_run",
        handler="vault_func",
        out_path=out_path,
        params={"secrets": ["password", "path", "github_key", "aws_key"]},
    )
    spec.with_secrets("vault", [])

    result = function.run(spec)
    verify_state(result)

    db = get_run_db().connect()
    state, log = db.get_log(result.metadata.uid, project=project_name)
    log = str(log)
    print(state)

    assert (
        log.find("value: {}".format(aws_key_value)) != -1
    ), "aws secret value not detected in function output"
    assert (
        log.find("value: {}".format(github_key_value)) != -1
    ), "github secret value not detected in function output"
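
# A plausible sketch of the vault_func handler run by the tests above (an
# assumption inferred from the asserted log output): it reads each requested
# secret from the run context and prints it, producing the "value: ..." lines
# that the assertions look for.
def vault_func(context, secrets: list):
    for secret_name in secrets:
        secret_value = context.get_secret(secret_name)
        print("Secret name: {}, value: {}".format(secret_name, secret_value))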
def test_run_local():
    if Path(ARTIFACTS_PATH).is_dir():
        shutil.rmtree(ARTIFACTS_PATH)
    task = new_task(
        name="task-feature-selection",
        handler=feature_selection,
        params={"k": 2, "min_votes": 0.3, "label_column": "is_error"},
        inputs={"df_artifact": "data/metrics.pq"},
    )
    run_local(
        task=task,
        artifact_path=os.path.join(os.path.abspath("./"), "artifacts"),
    )
    _validate_paths({"feature_scores.parquet", "selected_features.parquet"})
def test_describe_dask_local():
    if Path(PLOTS_PATH).is_dir():
        shutil.rmtree(PLOTS_PATH)
    task = new_task(
        name="task-describe",
        handler=summarize,
        inputs={"table": DATA_URL},
        params={
            "update_dataset": True,
            "label_column": "label",
            "dask_function": "db://default/dask_tests",
        },
    )
    run_local(task)
    _validate_paths(
        {
            "corr.html",
            "correlation-matrix.csv",
            "hist.html",
            "imbalance.html",
            "imbalance-weights-vec.csv",
            "violin.html",
        }
    )
def run_ingestion_job(name, featureset, run_config, schedule=None):
    name = name or f"{featureset.metadata.name}_ingest"
    if not run_config.function:
        function_ref = featureset.spec.function.copy()
        if function_ref.is_empty():
            function_ref = FunctionReference(name=name, kind=RuntimeKinds.job)
        if not function_ref.url:
            code = function_ref.code or ""
            if run_config.kind == RuntimeKinds.remotespark:
                function_ref.code = code + _default_spark_handler
            else:
                function_ref.code = code + _default_job_handler
        run_config.function = function_ref
        run_config.handler = "handler"

    image = (
        _default_spark_image()
        if run_config.kind == RuntimeKinds.remotespark
        else mlrun.mlconf.feature_store.default_job_image
    )
    function = run_config.to_function("job", image)
    function.metadata.project = featureset.metadata.project
    function.metadata.name = function.metadata.name or name
    if not function.spec.image:
        raise mlrun.errors.MLRunInvalidArgumentError("function image must be specified")

    task = mlrun.new_task(
        name=name, params=run_config.parameters, handler=run_config.handler
    )
    task.spec.secret_sources = run_config.secret_sources
    task.set_label("job-type", "feature-ingest").set_label(
        "feature-set", featureset.uri
    )

    # set the run UID and save it in the feature set status (linking the feature set to the job)
    task.metadata.uid = uuid.uuid4().hex
    featureset.status.run_uri = task.metadata.uid
    featureset.save()

    run = function.run(
        task, schedule=schedule, local=run_config.local, watch=run_config.watch
    )
    if run_config.watch:
        featureset.reload()
    return run
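
# A hedged sketch of what the _default_job_handler code string appended above
# plausibly contains (an assumption, not necessarily the library's exact source):
# a thin wrapper that calls the feature store ingest API with the run context,
# which is why run_config.handler is set to "handler".
_example_default_job_handler = """
from mlrun.feature_store.api import ingest
def handler(context):
    ingest(mlrun_context=context)
"""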
def test_run_local():
    if Path(PLOTS_PATH).is_dir():
        shutil.rmtree(PLOTS_PATH)
    task = new_task(
        name="task-describe",
        handler=summarize,
        inputs={"table": DATA_URL},
        params={"update_dataset": True, "label_column": "label"},
    )
    run_local(task)
    _validate_paths(
        {
            "corr.html",
            "correlation-matrix.csv",
            "hist.html",
            "imbalance.html",
            "imbalance-weights-vec.csv",
            "violin.html",
        }
    )
def test_hyper_parallel_with_stop():
    list_params = '{"p2": [2,3,7,4,5], "p3": [10,10,10,10,10]}'
    mlrun.datastore.set_in_memory_item("params.json", list_params)
    run_spec = mlrun.new_task(params={"p1": 1})
    run_spec.with_hyper_params(
        {"p2": [2, 3, 7, 4, 5], "p3": [10, 10, 10, 10, 10]},
        parallel_runs=2,
        selector="max.r1",
        strategy="list",
        stop_condition="r1>=70",
    )
    run = new_function().run(run_spec, handler=hyper_func)
    verify_state(run)

    # result: r1 = p2 * p3, so r1 >= 70 triggers a stop on the third run;
    # one extra iteration may already be in flight, so allow either 4 or 5
    assert len(run.status.iterations) in [4, 5], "wrong number of iterations"
    assert run.output("best_iteration") == 3, "wrong best iteration"
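
# A minimal sketch of the hyper_func handler assumed by the test above (the
# real helper may differ): it logs the result r1 = p2 * p3 that the selector
# ("max.r1") and stop_condition ("r1>=70") act on, so the third list item
# (p2=7, p3=10) triggers the stop.
def hyper_func(context, p1=1, p2=1, p3=1):
    context.log_result("r1", p2 * p3)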
def test_azure_vault_end_to_end():
    mlconf.dbpath = f"http://localhost:{api_db_port}"
    project_name = "proj1"

    # Create the function and set its container configuration
    function = code_to_function(
        name="azure_vault_func",
        filename="vault_function.py",
        handler="vault_func",
        project=project_name,
        kind="job",
    )
    function.spec.image = "mlrun/mlrun:unstable"

    # Create a task for the execution
    spec = new_task(
        project=project_name,
        name="azure_vault_test_run",
        handler="vault_func",
        out_path=out_path,
        params={"secrets": ["demo-key-1", "demo-key-2"]},
    )
    spec.with_secrets(
        "azure_vault",
        {
            "name": "saar-key-vault",
            "k8s_secret": azure_key_vault_k8s_secret,
            "secrets": [],
        },
    )

    result = function.run(spec)
    verify_state(result)

    db = get_run_db().connect()
    db.get_log(result.metadata.uid, project=project_name)
def custom_setup(self):
    self._logger.debug("Connecting to database")

    self._logger.debug("Creating dummy task for db queries")

    # {{run.uid}} will be substituted with the run id, so output will be written
    # to a different directory per run
    output_path = str(self.results_path / "{{run.uid}}")
    task = (
        new_task(name="demo", params={"p1": 5}, artifact_path=output_path)
        .with_secrets("file", self.assets_path / "secrets.txt")
        .set_label("type", "demo")
    )

    self._logger.debug("Running dummy task")
    run_object = run_local(task, command="training.py", workdir=str(self.assets_path))
    self._logger.debug("Finished running dummy task", run_object=run_object.to_dict())

    self._run_uid = run_object.uid()
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import getpass
from os import path, environ

from mlrun import new_task, run_local, code_to_function
from tests.conftest import (
    examples_path,
    out_path,
    tag_test,
    verify_state,
)

base_spec = new_task(params={"p1": 8}, out_path=out_path)
base_spec.spec.inputs = {"infile.txt": "infile.txt"}


def test_run_local():
    spec = tag_test(base_spec, "test_run_local")
    result = run_local(
        spec, command=f"{examples_path}/training.py", workdir=examples_path
    )
    verify_state(result)


def test_run_local_with_uid_does_not_exist(monkeypatch):
    """
    Mocking a scenario that happened in the field, in which getuser raised the
    same error as the mock. The problem was basically that the code was
def test_db_commands(self):
    self._logger.debug("Creating dummy task for db queries")

    # {{run.uid}} will be substituted with the run id, so output will be written
    # to a different directory per run
    output_path = str(self.results_path / "{{run.uid}}")
    task = (
        new_task(name="demo", params={"p1": 5}, artifact_path=output_path)
        .with_secrets("file", self.assets_path / "secrets.txt")
        .set_label("type", "demo")
    )

    runs_count_before_run = len(self._run_db.list_runs(project=self.project_name))
    artifacts_count_before_run = len(
        self._run_db.list_artifacts(project=self.project_name, tag="*")
    )

    self._logger.debug("Running dummy task")
    run_object = run_local(task, command="training.py", workdir=str(self.assets_path))
    self._logger.debug("Finished running dummy task", run_object=run_object.to_dict())

    self._run_uid = run_object.uid()

    runs = self._run_db.list_runs(project=self.project_name)
    assert len(runs) == runs_count_before_run + 1
    self._verify_run_metadata(
        runs[0]["metadata"],
        uid=self._run_uid,
        name="demo",
        project=self.project_name,
        labels={"kind": "", "framework": "sklearn"},
    )
    self._verify_run_spec(
        runs[0]["spec"],
        parameters={"p1": 5, "p2": "a-string"},
        inputs={"infile.txt": str(self.assets_path / "infile.txt")},
        outputs=[],
        output_path=str(self.results_path / self._run_uid),
        secret_sources=[],
        data_stores=[],
    )

    artifacts = self._run_db.list_artifacts(project=self.project_name, tag="*")
    assert len(artifacts) == artifacts_count_before_run + 4
    for artifact_key in ["chart", "html_result", "model", "mydf"]:
        artifact_exists = False
        for artifact in artifacts:
            if artifact["key"] == artifact_key:
                artifact_exists = True
                break
        assert artifact_exists

    runtimes = self._run_db.list_runtimes()
    assert len(runtimes) == len(mlrun.runtimes.RuntimeKinds.runtime_with_handlers())
    for runtime_kind in mlrun.runtimes.RuntimeKinds.runtime_with_handlers():
        runtime_exists = False
        for runtime in runtimes:
            if runtime["kind"] == runtime_kind:
                runtime_exists = True
                break
        assert runtime_exists
def test_dask_local():
    spec = tag_test(new_task(params={"p1": 3, "p2": "vv"}), "test_dask_local")
    function = new_function(kind="dask")
    function.spec.remote = False
    run = function.run(spec, handler=my_func)
    verify_state(run)
def run_function(
    function: Union[str, mlrun.runtimes.BaseRuntime],
    handler: str = None,
    name: str = "",
    params: dict = None,
    hyperparams: dict = None,
    hyper_param_options: mlrun.model.HyperParamOptions = None,
    inputs: dict = None,
    outputs: List[str] = None,
    workdir: str = "",
    labels: dict = None,
    base_task: mlrun.model.RunTemplate = None,
    watch: bool = True,
    local: bool = False,
    verbose: bool = None,
    project_object=None,
) -> Union[mlrun.model.RunObject, kfp.dsl.ContainerOp]:
    """Run a local or remote task as part of a local/kubeflow pipeline

    run_function() allows you to execute a function locally, on a remote cluster,
    or as part of an automated workflow. The function can be specified as an object
    or by name (str); when the function is specified by name it is looked up in the
    current project, eliminating the need to redefine/edit functions.

    When functions run as part of a workflow/pipeline (project.run()) some attributes
    can be set at the run level, e.g. local=True will run all the functions locally,
    and setting artifact_path will direct all outputs to the same path. Project runs
    provide additional notifications/reporting and exception handling.

    Inside a Kubeflow pipeline (KFP), run_function() generates KFP "ContainerOps"
    which are used to form a DAG; some behavior may differ between regular runs and
    deferred KFP runs.

    example (use with function object)::

        function = mlrun.import_function("hub://sklearn_classifier")
        run1 = run_function(function, params={"data": url})

    example (use with project)::

        # create a project with two functions (local and from the marketplace)
        project = mlrun.new_project(project_name, "./proj")
        project.set_function("mycode.py", "myfunc", image="mlrun/mlrun")
        project.set_function("hub://sklearn_classifier", "train")

        # run functions (refer to them by name)
        run1 = run_function("myfunc", params={"x": 7})
        run2 = run_function("train", params={"data": run1.outputs["data"]})

    example (use in pipeline)::

        @dsl.pipeline(name="test pipeline", description="test")
        def my_pipe(url=""):
            run1 = run_function("loaddata", params={"url": url})
            run2 = run_function("train", params={"data": run1.outputs["data"]})

        project.run(workflow_handler=my_pipe, arguments={"param1": 7})

    :param function:            name of the function (in the project) or function object
    :param handler:             name of the function handler
    :param name:                execution name
    :param params:              input parameters (dict)
    :param hyperparams:         hyper parameters
    :param hyper_param_options: hyper param options (selector, early stop, strategy, ..)
                                see: :py:class:`~mlrun.model.HyperParamOptions`
    :param inputs:              input objects (dict of key: path)
    :param outputs:             list of outputs that can be passed on in the workflow
    :param workdir:             default input artifacts path
    :param labels:              labels to tag the job/run with ({key:val, ..})
    :param base_task:           task object to use as base
    :param watch:               watch/follow run log, True by default
    :param local:               run the function locally vs on the runtime/cluster
    :param verbose:             add verbose prints/logs

    :return: MLRun RunObject or KubeFlow ContainerOp
    """
    engine, function = _get_engine_and_function(function, project_object)
    task = mlrun.new_task(
        name,
        handler=handler,
        params=params,
        hyper_params=hyperparams,
        hyper_param_options=hyper_param_options,
        inputs=inputs,
        base=base_task,
    )
    task.spec.verbose = task.spec.verbose or verbose

    if engine == "kfp":
        return function.as_step(
            runspec=task, workdir=workdir, outputs=outputs, labels=labels
        )
    else:
        if pipeline_context.workflow:
            local = local or pipeline_context.workflow.run_local
        task.metadata.labels = task.metadata.labels or labels or {}
        task.metadata.labels["workflow"] = pipeline_context.workflow_id
        run_result = function.run(
            runspec=task,
            workdir=workdir,
            verbose=verbose,
            watch=watch,
            local=local,
            artifact_path=pipeline_context.workflow_artifact_path,
        )
        if run_result:
            run_result._notified = False
            pipeline_context.runs_map[run_result.uid()] = run_result
            run_result.after = (
                lambda x: run_result
            )  # emulate a KFP op; .after() will be ignored
        return run_result
def run_ingestion_job(
    featureset: Union[FeatureSet, str],
    source: DataSource = None,
    targets: List[DataTargetBase] = None,
    name: str = None,
    infer_options: InferOptions = InferOptions.default(),
    parameters: Dict[str, Union[str, list, dict]] = None,
    function=None,
    local=False,
    watch=True,
    auto_mount=False,
    engine=None,
    secrets=None,
    handler=None,
):
    """Start a batch ingestion task using a remote MLRun job or spark function

    Deploy and run a batch job implementing the feature ingestion pipeline.
    Ingestion will deploy MLRun python or spark jobs (use the `engine` attribute to
    select spark); for scheduled jobs set the schedule attribute in the offline source.

    example::

        source = CSVSource("mycsv", path="measurements.csv")
        targets = [CSVTarget("mycsv", path="./mycsv.csv")]
        run_ingestion_job(measurements, source, targets, name="tst_ingest")

    :param featureset:    feature set object or uri
    :param source:        data source object describing the online or offline source
    :param targets:       list of data target objects
    :param name:          name for the job/function
    :param infer_options: schema and stats infer options
    :param parameters:    extra parameter dictionary which is passed to the graph context
    :param function:      custom ingestion function
    :param local:         run local emulation using mock_server() or run_local()
    :param watch:         wait for job completion, set to False if you don't want to wait
    :param auto_mount:    add a PVC or v3io volume to the function (using mlrun.platforms.auto_mount)
    :param engine:        ingestion engine, set to "spark" for using Spark
    :param secrets:       key/value dictionary for secrets (for data credential vars)
    :param handler:       run a specific handler/method in the function
    """
    if isinstance(featureset, str):
        featureset = get_feature_set_by_uri(featureset)

    source, parameters = set_task_params(
        featureset, source, targets, parameters, infer_options
    )

    name = name or f"{featureset.metadata.name}_ingest"
    function = default_ingestion_job_function(name, featureset, engine, function)
    if auto_mount:
        function.apply(mlrun.platforms.auto_mount())
    function.metadata.project = featureset.metadata.project

    task = mlrun.new_task(name=name, params=parameters, handler=handler)
    if secrets:
        task.with_secrets("inline", secrets)  # todo: replace with vault

    # set the run UID and save it in the feature set status (linking the feature set to the job)
    task.metadata.uid = uuid.uuid4().hex
    featureset.status.run_uri = task.metadata.uid
    featureset.save()

    run = function.run(task, schedule=source.schedule, local=local, watch=watch)
    if watch:
        featureset.reload()
    return run
"first_name": ["Jason", "Molly", "Tina", "Jake", "Amy"], "last_name": ["Miller", "Jacobson", "Ali", "Milner", "Cooze"], "x": np.array([1, 2, 3.2, np.nan, 5.5]), "y": [25, 94, 0.1, 57, datetime.datetime(2018, 1, 1)], } df = pd.DataFrame(raw_data, columns=["first_name", "last_name", "x", "y"]) context.log_dataset("df1", df=df, format="csv") date_rng = pd.date_range("2018-01-01", periods=4, freq="H") df = pd.DataFrame(date_rng, columns=["date"]) df["data"] = np.random.rand(4) df["nan"] = np.nan df["datetime"] = pd.to_datetime(df["date"]) df["text"] = "x" df = df.set_index("datetime") context.log_dataset("df2", df=df) return np.nan base_spec = new_task(artifact_path=out_path, handler=my_func) def test_serialization(): spec = tag_test(base_spec, "test_serialization") result = run_local(spec) verify_state(result) pprint(result.to_dict()) print(result.to_yaml()) pprint(result.to_json())