Example 1
def test_local_py():
    file_path = f"{examples_path}/training.py"
    mod = function_to_module(file_path)
    task = new_task(inputs={"infile.txt": f"{examples_path}/infile.txt"})
    context = get_or_create_ctx("myfunc", spec=task)
    mod.my_job(context, p1=2, p2="x")
    assert context.results["accuracy"] == 4, "failed to run"
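The assertion above (accuracy == 4 when p1=2) pins down what the my_job handler in examples/training.py must compute; a minimal sketch of such a handler, assuming it simply doubles p1 (the body is an assumption, not the repository's actual training code):

# hypothetical sketch of the my_job handler referenced above,
# inferred from the assertion accuracy == 4 when p1=2
def my_job(context, p1=1, p2="a-string"):
    context.logger.info(f"running with p1={p1}, p2={p2}")
    context.log_result("accuracy", p1 * 2)  # p1=2 yields accuracy == 4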
Example 2
        def dask_pipe(x=1, y=10):

            # the use_db option uses a function (DB) pointer instead of embedding the function spec in the workflow YAML
            self.dask_function.as_step(
                new_task(handler="main", name="dask_pipeline", params={"x": x, "y": y}),
                use_db=True,
            )
Example 3
    def test_run_training_job(self):
        output_path = str(self.results_path / "{{run.uid}}")

        self._logger.debug("Creating base task")
        base_task = new_task(artifact_path=output_path).set_label("stage", "dev")

        # run our training task (p1=9), derived from the base task
        self._logger.debug("Running training task")
        train_task = new_task(
            name="my-training", handler="training", params={"p1": 9}, base=base_task
        )
        train_run = self._trainer.run(train_task)

        # running validation, use the model result from the previous step
        self._logger.debug("Running validation using the model from the previous step")
        model = train_run.outputs["mymodel"]
        self._trainer.run(base_task, handler="validation", inputs={"model": model})
Example 4
def _generate_task(p1, out_path):
    return new_task(
        params={
            "p1": p1
        },
        out_path=out_path,
        outputs=["accuracy", "loss"],
    ).set_label("tests", "kfp")
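A possible way this helper could be consumed in a test, assuming a function object with a "training" handler (trainer_fn and the paths are hypothetical names, not taken from the snippet):

# hypothetical usage of _generate_task; trainer_fn and out_path are assumptions
task = _generate_task(p1=5, out_path="/tmp/out")
run = trainer_fn.run(task, handler="training")
print(run.outputs["accuracy"], run.outputs["loss"])  # the outputs declared above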
Example 5
def run_ingestion_job(name, featureset, run_config, schedule=None, spark_service=None):
    name = name or f"{featureset.metadata.name}_ingest"
    use_spark = featureset.spec.engine == "spark"
    if use_spark and not run_config.local and not spark_service:
        raise mlrun.errors.MLRunInvalidArgumentError(
            "Remote spark ingestion requires the spark service name to be provided"
        )

    default_kind = RuntimeKinds.remotespark if use_spark else RuntimeKinds.job
    spark_runtimes = [RuntimeKinds.remotespark]  # may support spark operator in future

    if not run_config.function:
        function_ref = featureset.spec.function.copy()
        if function_ref.is_empty():
            function_ref = FunctionReference(name=name, kind=default_kind)
        if not function_ref.url:
            function_ref.code = (function_ref.code or "") + _default_job_handler
        run_config.function = function_ref
        run_config.handler = "handler"

    image = None if use_spark else mlrun.mlconf.feature_store.default_job_image
    function = run_config.to_function(default_kind, image)
    if use_spark and function.kind not in spark_runtimes:
        raise mlrun.errors.MLRunInvalidArgumentError(
            "ingest with spark engine require spark function kind"
        )

    function.metadata.project = featureset.metadata.project
    function.metadata.name = function.metadata.name or name

    if not use_spark and not function.spec.image:
        raise mlrun.errors.MLRunInvalidArgumentError("function image must be specified")

    if use_spark and not run_config.local:
        function.with_spark_service(spark_service=spark_service)

    task = mlrun.new_task(
        name=name,
        params=run_config.parameters,
        handler=run_config.handler,
        out_path=featureset.spec.output_path,
    )
    task.spec.secret_sources = run_config.secret_sources
    task.set_label("job-type", "feature-ingest").set_label(
        "feature-set", featureset.uri
    )

    # set the run UID and save it in the feature set status (linking the feature set to the job)
    task.metadata.uid = uuid.uuid4().hex
    featureset.status.run_uri = task.metadata.uid
    featureset.save()

    run = function.run(
        task, schedule=schedule, local=run_config.local, watch=run_config.watch
    )
    if run_config.watch:
        featureset.reload()
    return run
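A hedged usage sketch for the spark path of this helper, assuming a feature set whose spec.engine is "spark" and a run_config object exposing the attributes used above (my_featureset, my_run_config, and the service name are hypothetical):

# hypothetical call; my_featureset, my_run_config, and the service name are assumptions
run = run_ingestion_job(
    name=None,                          # defaults to "<featureset name>_ingest"
    featureset=my_featureset,           # FeatureSet with spec.engine == "spark"
    run_config=my_run_config,           # run config with local=False
    spark_service="my-spark-service",   # required for remote spark ingestion
)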
Example 6
    def custom_setup(self):
        self._logger.debug("Creating basics task")

        # {{run.uid}} will be substituted with the run id, so output will be written to a different directory per run
        output_path = str(self.results_path / "{{run.uid}}")
        self._basics_task = (
            new_task(name="demo", params={"p1": 5}, artifact_path=output_path)
            .with_secrets("file", self.assets_path / "secrets.txt")
            .set_label("type", "demo")
        )

        self._logger.debug("Creating inline task")
        self._inline_task = new_task(
            name="demo2",
            handler=self._get_inline_handler(),
            artifact_path=str(self.results_path / "{{run.uid}}"),
        )
Example 7
def test_vault_end_to_end():
    # This test requires a running MLRun API server configured to work with Vault. The port
    # below should be configured to allow access to the server.
    api_server_port = 57764

    _set_vault_mlrun_configuration(api_server_port)
    project_name = "abc"
    func_name = "vault-function"
    aws_key_value = "1234567890"
    github_key_value = "proj1Key!!!"

    project = new_project(project_name)
    # This call will initialize Vault infrastructure and add the given secrets
    # It executes on the API server
    project.create_vault_secrets({
        "aws_key": aws_key_value,
        "github_key": github_key_value
    })

    # This API executes on the client side
    project_secrets = project.get_vault_secret_keys()
    assert project_secrets == ["aws_key", "github_key"], "secrets not created"

    # Create function and set container configuration
    function = code_to_function(
        name=func_name,
        filename="{}/vault_function.py".format(examples_path),
        handler="vault_func",
        project=project_name,
        kind="job",
    )

    function.spec.image = "saarcoiguazio/mlrun:unstable"

    # Create context for the execution
    spec = new_task(
        project=project_name,
        name="vault_test_run",
        handler="vault_func",
        out_path=out_path,
        params={"secrets": ["password", "path", "github_key", "aws_key"]},
    )
    spec.with_secrets("vault", [])

    result = function.run(spec)
    verify_state(result)

    db = get_run_db().connect()
    state, log = db.get_log(result.metadata.uid, project=project_name)
    log = str(log)
    print(state)

    assert (log.find("value: {}".format(aws_key_value)) !=
            -1), "secret value not detected in function output"
    assert (log.find("value: {}".format(github_key_value)) !=
            -1), "secret value not detected in function output"
Example 8
def test_run_local():
    if Path(ARTIFACTS_PATH).is_dir():
        shutil.rmtree(ARTIFACTS_PATH)

    task = new_task(
        name="task-feature-selection",
        handler=feature_selection,
        params={"k": 2, "min_votes": 0.3, "label_column": "is_error"},
        inputs={"df_artifact": "data/metrics.pq"},
    )
    run_local(
        task=task, artifact_path=os.path.join(os.path.abspath("./"), "artifacts")
    )
    _validate_paths({"feature_scores.parquet", "selected_features.parquet"})
Example 9
def test_describe_dask_local():
    if Path(PLOTS_PATH).is_dir():
        shutil.rmtree(PLOTS_PATH)
    task = new_task(name="task-describe",
                    handler=summarize,
                    inputs={"table": DATA_URL},
                    params={
                        'update_dataset': True,
                        'label_column': 'label',
                        'dask_function': 'db://default/dask_tests'
                    })
    run_local(task)
    _validate_paths({
        'corr.html', 'correlation-matrix.csv', 'hist.html', 'imbalance.html',
        'imbalance-weights-vec.csv', 'violin.html'
    })
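The 'dask_function' parameter points at a function stored in the MLRun DB ("db://<project>/<name>"); a hedged sketch of how such a function might be registered beforehand so the URI resolves (the image is an assumption):

# hypothetical setup for the "db://default/dask_tests" reference above
import mlrun

dask_cluster = mlrun.new_function("dask_tests", kind="dask", image="mlrun/ml-models")
dask_cluster.save()  # stores the function spec in the MLRun DB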
Example 10
def run_ingestion_job(name, featureset, run_config, schedule=None):
    name = name or f"{featureset.metadata.name}_ingest"

    if not run_config.function:
        function_ref = featureset.spec.function.copy()
        if function_ref.is_empty():
            function_ref = FunctionReference(name=name, kind=RuntimeKinds.job)
        if not function_ref.url:
            code = function_ref.code or ""
            if run_config.kind == RuntimeKinds.remotespark:
                function_ref.code = code + _default_spark_handler
            else:
                function_ref.code = code + _default_job_handler
        run_config.function = function_ref
        run_config.handler = "handler"

    image = (_default_spark_image()
             if run_config.kind == RuntimeKinds.remotespark else
             mlrun.mlconf.feature_store.default_job_image)
    function = run_config.to_function("job", image)
    function.metadata.project = featureset.metadata.project
    function.metadata.name = function.metadata.name or name

    if not function.spec.image:
        raise mlrun.errors.MLRunInvalidArgumentError(
            "function image must be specified")

    task = mlrun.new_task(name=name,
                          params=run_config.parameters,
                          handler=run_config.handler)
    task.spec.secret_sources = run_config.secret_sources
    task.set_label("job-type",
                   "feature-ingest").set_label("feature-set", featureset.uri)

    # set the run UID and save it in the feature set status (linking the feature set to the job)
    task.metadata.uid = uuid.uuid4().hex
    featureset.status.run_uri = task.metadata.uid
    featureset.save()

    run = function.run(task,
                       schedule=schedule,
                       local=run_config.local,
                       watch=run_config.watch)
    if run_config.watch:
        featureset.reload()
    return run
Example 11
def test_run_local():
    if Path(PLOTS_PATH).is_dir():
        shutil.rmtree(PLOTS_PATH)
    task = new_task(
        name="task-describe",
        handler=summarize,
        inputs={"table": DATA_URL},
        params={
            "update_dataset": True,
            "label_column": "label"
        },
    )
    run_local(task)
    _validate_paths({
        "corr.html",
        "correlation-matrix.csv",
        "hist.html",
        "imbalance.html",
        "imbalance-weights-vec.csv",
        "violin.html",
    })
Example 12
def test_hyper_parallel_with_stop():
    list_params = '{"p2": [2,3,7,4,5], "p3": [10,10,10,10,10]}'
    mlrun.datastore.set_in_memory_item("params.json", list_params)

    run_spec = mlrun.new_task(params={"p1": 1})
    run_spec.with_hyper_params(
        {
            "p2": [2, 3, 7, 4, 5],
            "p3": [10, 10, 10, 10, 10]
        },
        parallel_runs=2,
        selector="max.r1",
        strategy="list",
        stop_condition="r1>=70",
    )
    run = new_function().run(run_spec, handler=hyper_func)

    verify_state(run)
    # result: r1 = p2 * p3; r1 >= 70 triggers a stop on the third run
    # one extra iteration may be in flight, so either 4 or 5 iterations are accepted
    assert len(run.status.iterations) in [4, 5], "wrong number of iterations"
    assert run.output("best_iteration") == 3, "wrong best iteration"
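The comment pins down the handler's behavior (r1 = p2 * p3); a minimal hyper_func consistent with that relation (the exact body is an assumption):

# hypothetical handler consistent with the comment above: r1 = p2 * p3
def hyper_func(context, p1, p2, p3):
    context.log_result("r1", p2 * p3)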
Example 13
def test_azure_vault_end_to_end():
    mlconf.dbpath = f"http://localhost:{api_db_port}"

    project_name = "proj1"

    # Create function and set container configuration
    function = code_to_function(
        name="azure_vault_func",
        filename="vault_function.py",
        handler="vault_func",
        project=project_name,
        kind="job",
    )

    function.spec.image = "mlrun/mlrun:unstable"

    # Create context for the execution
    spec = new_task(
        project=project_name,
        name="azure_vault_test_run",
        handler="vault_func",
        out_path=out_path,
        params={"secrets": ["demo-key-1", "demo-key-2"]},
    )
    spec.with_secrets(
        "azure_vault",
        {
            "name": "saar-key-vault",
            "k8s_secret": azure_key_vault_k8s_secret,
            "secrets": [],
        },
    )

    result = function.run(spec)
    verify_state(result)

    db = get_run_db().connect()
    db.get_log(result.metadata.uid, project=project_name)
Example 14
    def custom_setup(self):
        self._logger.debug("Connecting to database")

        self._logger.debug("Creating dummy task for db queries")

        # {{run.uid}} will be substituted with the run id, so output will be written to a different directory per run
        output_path = str(self.results_path / "{{run.uid}}")
        task = (
            new_task(name="demo", params={"p1": 5}, artifact_path=output_path)
            .with_secrets("file", self.assets_path / "secrets.txt")
            .set_label("type", "demo")
        )

        self._logger.debug("Running dummy task")
        run_object = run_local(task,
                               command="training.py",
                               workdir=str(self.assets_path))
        self._logger.debug("Finished running dummy task",
                           run_object=run_object.to_dict())

        self._run_uid = run_object.uid()
Example 15
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import getpass
from os import path, environ

from mlrun import new_task, run_local, code_to_function
from tests.conftest import (
    examples_path,
    out_path,
    tag_test,
    verify_state,
)

base_spec = new_task(params={"p1": 8}, out_path=out_path)
base_spec.spec.inputs = {"infile.txt": "infile.txt"}


def test_run_local():
    spec = tag_test(base_spec, "test_run_local")
    result = run_local(spec,
                       command=f"{examples_path}/training.py",
                       workdir=examples_path)
    verify_state(result)


def test_run_local_with_uid_does_not_exist(monkeypatch):
    """
    Mocking a scenario that happened in the field, in which getuser raised the same error as the mock
    The problem was basically that the code was
Example 16
    def test_db_commands(self):
        self._logger.debug("Creating dummy task for db queries")

        # {{run.uid}} will be substituted with the run id, so output will be written to a different directory per run
        output_path = str(self.results_path / "{{run.uid}}")
        task = (
            new_task(name="demo", params={"p1": 5}, artifact_path=output_path)
            .with_secrets("file", self.assets_path / "secrets.txt")
            .set_label("type", "demo")
        )
        runs_count_before_run = len(
            self._run_db.list_runs(project=self.project_name))
        artifacts_count_before_run = len(
            self._run_db.list_artifacts(project=self.project_name, tag="*"))

        self._logger.debug("Running dummy task")
        run_object = run_local(task,
                               command="training.py",
                               workdir=str(self.assets_path))
        self._logger.debug("Finished running dummy task",
                           run_object=run_object.to_dict())

        self._run_uid = run_object.uid()

        runs = self._run_db.list_runs(project=self.project_name)
        assert len(runs) == runs_count_before_run + 1

        self._verify_run_metadata(
            runs[0]["metadata"],
            uid=self._run_uid,
            name="demo",
            project=self.project_name,
            labels={
                "kind": "",
                "framework": "sklearn"
            },
        )
        self._verify_run_spec(
            runs[0]["spec"],
            parameters={
                "p1": 5,
                "p2": "a-string"
            },
            inputs={"infile.txt": str(self.assets_path / "infile.txt")},
            outputs=[],
            output_path=str(self.results_path / self._run_uid),
            secret_sources=[],
            data_stores=[],
        )

        artifacts = self._run_db.list_artifacts(project=self.project_name,
                                                tag="*")
        assert len(artifacts) == artifacts_count_before_run + 4
        for artifact_key in ["chart", "html_result", "model", "mydf"]:
            artifact_exists = False
            for artifact in artifacts:
                if artifact["key"] == artifact_key:
                    artifact_exists = True
                    break
            assert artifact_exists

        runtimes = self._run_db.list_runtimes()
        assert len(runtimes) == len(mlrun.runtimes.RuntimeKinds.runtime_with_handlers())
        for runtime_kind in mlrun.runtimes.RuntimeKinds.runtime_with_handlers():
            runtime_exists = False
            for runtime in runtimes:
                if runtime["kind"] == runtime_kind:
                    runtime_exists = True
                    break
            assert runtime_exists
Example 17
def test_dask_local():
    spec = tag_test(new_task(params={"p1": 3, "p2": "vv"}), "test_dask_local")
    function = new_function(kind="dask")
    function.spec.remote = False
    run = function.run(spec, handler=my_func)
    verify_state(run)
Example 18
def run_function(
    function: Union[str, mlrun.runtimes.BaseRuntime],
    handler: str = None,
    name: str = "",
    params: dict = None,
    hyperparams: dict = None,
    hyper_param_options: mlrun.model.HyperParamOptions = None,
    inputs: dict = None,
    outputs: List[str] = None,
    workdir: str = "",
    labels: dict = None,
    base_task: mlrun.model.RunTemplate = None,
    watch: bool = True,
    local: bool = False,
    verbose: bool = None,
    project_object=None,
) -> Union[mlrun.model.RunObject, kfp.dsl.ContainerOp]:
    """Run a local or remote task as part of a local/kubeflow pipeline

    run_function() allow you to execute a function locally, on a remote cluster, or as part of an automated workflow
    function can be specified as an object or by name (str), when the function is specified by name it is looked up
    in the current project eliminating the need to redefine/edit functions.

    when functions run as part of a workflow/pipeline (project.run()) some attributes can be set at the run level,
    e.g. local=True will run all the functions locally, setting artifact_path will direct all outputs to the same path.
    project runs provide additional notifications/reporting and exception handling.
    inside a Kubeflow pipeline (KFP) run_function() generates KFP "ContainerOps" which are used to form a DAG
    some behavior may differ between regular runs and deferred KFP runs.

    example (use with function object)::

        function = mlrun.import_function("hub://sklearn_classifier")
        run1 = run_function(function, params={"data": url})

    example (use with project)::

        # create a project with two functions (local and from marketplace)
        project = mlrun.new_project(project_name, "./proj")
        project.set_function("mycode.py", "myfunc", image="mlrun/mlrun")
        project.set_function("hub://sklearn_classifier", "train")

        # run functions (refer to them by name)
        run1 = run_function("myfunc", params={"x": 7})
        run2 = run_function("train", params={"data": run1.outputs["data"]})

    example (use in pipeline)::

        @dsl.pipeline(name="test pipeline", description="test")
        def my_pipe(url=""):
            run1 = run_function("loaddata", params={"url": url})
            run2 = run_function("train", params={"data": run1.outputs["data"]})

        project.run(workflow_handler=my_pipe, arguments={"param1": 7})

    :param function:        name of the function (in the project) or function object
    :param handler:         name of the function handler
    :param name:            execution name
    :param params:          input parameters (dict)
    :param hyperparams:     hyper parameters
    :param hyper_param_options:  hyper param options (selector, early stop, strategy, ..)
                            see: :py:class:`~mlrun.model.HyperParamOptions`
    :param inputs:          input objects (dict of key: path)
    :param outputs:         list of outputs which can pass in the workflow
    :param workdir:         default input artifacts path
    :param labels:          labels to tag the job/run with ({key:val, ..})
    :param base_task:       task object to use as base
    :param watch:           watch/follow run log, True by default
    :param local:           run the function locally vs on the runtime/cluster
    :param verbose:         add verbose prints/logs

    :return: MLRun RunObject or Kubeflow ContainerOp
    """
    engine, function = _get_engine_and_function(function, project_object)
    task = mlrun.new_task(
        name,
        handler=handler,
        params=params,
        hyper_params=hyperparams,
        hyper_param_options=hyper_param_options,
        inputs=inputs,
        base=base_task,
    )
    task.spec.verbose = task.spec.verbose or verbose

    if engine == "kfp":
        return function.as_step(
            runspec=task, workdir=workdir, outputs=outputs, labels=labels
        )
    else:
        if pipeline_context.workflow:
            local = local or pipeline_context.workflow.run_local
        task.metadata.labels = task.metadata.labels or labels or {}
        task.metadata.labels["workflow"] = pipeline_context.workflow_id
        run_result = function.run(
            runspec=task,
            workdir=workdir,
            verbose=verbose,
            watch=watch,
            local=local,
            artifact_path=pipeline_context.workflow_artifact_path,
        )
        if run_result:
            run_result._notified = False
            pipeline_context.runs_map[run_result.uid()] = run_result
            run_result.after = (
                lambda x: run_result
            )  # emulate KFP op, .after() will be ignored
        return run_result
Example 19
def run_ingestion_job(
    featureset: Union[FeatureSet, str],
    source: DataSource = None,
    targets: List[DataTargetBase] = None,
    name: str = None,
    infer_options: InferOptions = InferOptions.default(),
    parameters: Dict[str, Union[str, list, dict]] = None,
    function=None,
    local=False,
    watch=True,
    auto_mount=False,
    engine=None,
    secrets=None,
    handler=None,
):
    """Start batch ingestion task using remote MLRun job or spark function

    Deploy and run batch job implementing feature ingestion pipeline
    sources will deploy mlrun python or spark jobs (use the `engine` attribute to select spark),
    for scheduled jobs set the schedule attribute in the offline source.

    example::

        source = CSVSource("mycsv", path="measurements.csv")
        targets = [CSVTarget("mycsv", path="./mycsv.csv")]
        run_ingestion_job(measurements, source, targets, name="tst_ingest")

    :param featureset:    feature set object or uri
    :param source:        data source object describing the online or offline source
    :param targets:       list of data target objects
    :param name:          name for the job/function
    :param infer_options: schema and stats infer options
    :param parameters:    extra parameter dictionary which is passed to the graph context
    :param function:      custom ingestion function
    :param local:         run local emulation using mock_server() or run_local()
    :param watch:         wait for job completion; set to False if you don't want to wait
    :param auto_mount:    add PVC or v3io volume to the function (using mlrun.platform.auto_mount)
    :param engine:        ingestion engine, set to "spark" for using Spark
    :param secrets:       key/value dictionary for secrets (for data credential vars)
    :param handler:       run specific handler/method in the function
    """
    if isinstance(featureset, str):
        featureset = get_feature_set_by_uri(featureset)

    source, parameters = set_task_params(featureset, source, targets,
                                         parameters, infer_options)

    name = name or f"{featureset.metadata.name}_ingest"
    function = default_ingestion_job_function(name, featureset, engine,
                                              function)
    if auto_mount:
        function.apply(mlrun.platforms.auto_mount())

    function.metadata.project = featureset.metadata.project

    task = mlrun.new_task(name=name, params=parameters, handler=handler)
    if secrets:
        task.with_secrets("inline", secrets)  # todo: replace with vault

    # set the run UID and save it in the feature set status (linking the feature set to the job)
    task.metadata.uid = uuid.uuid4().hex
    featureset.status.run_uri = task.metadata.uid
    featureset.save()

    run = function.run(task,
                       schedule=source.schedule,
                       local=local,
                       watch=watch)
    if watch:
        featureset.reload()
    return run
Example 20
        "first_name": ["Jason", "Molly", "Tina", "Jake", "Amy"],
        "last_name": ["Miller", "Jacobson", "Ali", "Milner", "Cooze"],
        "x": np.array([1, 2, 3.2, np.nan, 5.5]),
        "y": [25, 94, 0.1, 57, datetime.datetime(2018, 1, 1)],
    }
    df = pd.DataFrame(raw_data, columns=["first_name", "last_name", "x", "y"])
    context.log_dataset("df1", df=df, format="csv")

    date_rng = pd.date_range("2018-01-01", periods=4, freq="H")
    df = pd.DataFrame(date_rng, columns=["date"])
    df["data"] = np.random.rand(4)
    df["nan"] = np.nan
    df["datetime"] = pd.to_datetime(df["date"])
    df["text"] = "x"
    df = df.set_index("datetime")
    context.log_dataset("df2", df=df)

    return np.nan


base_spec = new_task(artifact_path=out_path, handler=my_func)


def test_serialization():
    spec = tag_test(base_spec, "test_serialization")
    result = run_local(spec)
    verify_state(result)
    pprint(result.to_dict())
    print(result.to_yaml())
    pprint(result.to_json())