Esempio n. 1
0
def dbnd_on_pre_init_context(ctx):
    """Switch MLFlow to a composite dbnd tracking URI.

    Returns without touching MLFlow when ``mlflow_tracking.databand_tracking``
    is disabled or when ``core.databand_url`` is empty. Otherwise the tracking
    URI that was active before the switch is stored in the module-level
    ``_original_mlflow_tracking_uri`` and the composite URI is installed.

    :param ctx: not referenced in the body.
    :raises DatabandConfigError: when ``duplicate_tracking_to`` is not
        configured and the current MLFlow tracking URI is already composite.
    """
    from mlflow import get_tracking_uri, set_tracking_uri

    global _original_mlflow_tracking_uri

    tracking_enabled = config.getboolean("mlflow_tracking", "databand_tracking")
    if not tracking_enabled:
        return

    databand_url = config.get("core", "databand_url")
    if not databand_url:
        logger.info(
            "Although 'databand_tracking' was set in 'mlflow_tracking', "
            "dbnd will not use it since 'core.databand_url' was not set."
        )
        return

    mirror_uri = config.get("mlflow_tracking", "duplicate_tracking_to")
    if not mirror_uri:
        # Fall back to whatever MLFlow is currently configured with.
        mirror_uri = get_tracking_uri()

        # A composite URI here means the dbnd store was already wired in
        # through MLFlow's own configuration.
        if is_composite_uri(mirror_uri):
            raise DatabandConfigError(
                "Config conflict: MLFlow and DBND configs both define dbnd store uri"
            )

    combined_uri = build_composite_uri(databand_url, mirror_uri)

    _original_mlflow_tracking_uri = get_tracking_uri()
    set_tracking_uri(combined_uri)
Esempio n. 2
0
    def __init__(self, _tracking_uri=get_pod_uri("mlflow", "5001")):
        """Connect this manager to an MLFlow tracking server.

        :param _tracking_uri: URI of the tracking server. The default comes
            from ``get_pod_uri("mlflow", "5001")`` and is computed once, when
            this ``def`` statement is executed.
        """
        import warnings

        mlflow.set_tracking_uri(_tracking_uri)
        active_uri = mlflow.get_tracking_uri()
        print("Tracking Model Metadata on MLFlow Server @ " + active_uri)

        if active_uri != _tracking_uri:
            # Instantiating ``Warning`` on its own reports nothing; go through
            # ``warnings.warn`` so the mismatch is actually surfaced.
            warnings.warn(
                "MLManager doesn't seem to be communicating with the right server endpoint. "
                "Try instantiating this class again!")

        MlflowClient.__init__(self, _tracking_uri)
        self.active_run = None
        self.active_experiment = None
Esempio n. 3
0
    def test_create_new_or_continue_experiment(self, with_uri, name_from_meta):
        """Check the experiment name and tracking URI picked for a project dir.

        :param with_uri: when truthy, ``MLFLOW_TRACKING_URI`` is set in the
            environment before the experiment is created.
        :param name_from_meta: when truthy, a ``nucleus7_project.json`` file
            carrying ``PROJECT_NAME`` is written into the project directory.
        """
        root_dir = self.get_temp_dir()
        projects_dir = os.path.join(root_dir, "projects_dir")
        os.mkdir(projects_dir)
        project_dir = os.path.join(projects_dir, "experiment_test")
        uri_dir = os.path.join(root_dir, "env_uri")

        if name_from_meta:
            os.mkdir(project_dir)
            meta_path = os.path.join(project_dir, "nucleus7_project.json")
            with open(meta_path, "w") as meta_file:
                json.dump({"PROJECT_NAME": "experiment_test_meta"}, meta_file)

        if with_uri:
            os.environ["MLFLOW_TRACKING_URI"] = uri_dir

        mlflow_utils.create_new_or_continue_experiment(project_dir)

        # Read the experiment id from the active run, starting one if needed.
        if mlflow.active_run():
            experiment_id = mlflow.active_run().info.experiment_id
        else:
            with mlflow.start_run():
                experiment_id = mlflow.active_run().info.experiment_id

        client = mlflow.tracking.MlflowClient()
        experiment_name = client.get_experiment(experiment_id).name

        if name_from_meta:
            expected_name = "experiment_test_meta"
        else:
            expected_name = "experiment_test"

        if with_uri:
            expected_uri = uri_dir
        else:
            expected_uri = os.path.join(projects_dir, 'mlruns')

        self.assertEqual(expected_uri, mlflow.get_tracking_uri())
        self.assertEqual(expected_name, experiment_name)
Esempio n. 4
0
def test_kedro_mlflow_config_setup_tracking_priority(mocker, tmp_path,
                                                     config_dir):
    """The tracking URI passed to the config wins over the one in credentials.

    Args:
        mocker: used to patch ``kedro_mlflow.utils._is_kedro_project``.
        tmp_path: directory used as the kedro project root.
        config_dir: not referenced in the body; presumably a fixture that
            prepares the ``conf`` tree -- TODO confirm.
    """
    # Make any path pass the "is this a kedro project" check.
    mocker.patch("kedro_mlflow.utils._is_kedro_project", lambda x: True)

    credentials = {"my_mlflow_creds": {"mlflow_tracking_uri": "mlruns2"}}
    credentials_file = tmp_path / "conf/base/credentials.yml"
    credentials_file.write_text(yaml.dump(credentials))

    config = KedroMlflowConfig(
        project_path=tmp_path,
        mlflow_tracking_uri="mlruns1",
        credentials="my_mlflow_creds",
    )
    config.setup(load_context(tmp_path))

    expected_uri = (tmp_path / "mlruns1").as_uri()
    assert mlflow.get_tracking_uri() == expected_uri
Esempio n. 5
0
def process_log_func(exp_name):
    """Log one MLflow run from the current process into ``exp_name``.

    Writes a small feature-list file under ``/tmp``, then records the process
    and parent-process ids as params, one random metric and the file as an
    artifact. Finally prints the experiment id when the experiment is found.

    :param exp_name: name of the experiment to log to.
    """
    import mlflow
    import random

    mlflow.set_experiment(exp_name)

    # Re-apply the tracking URI that is currently in effect.
    mlflow.set_tracking_uri(mlflow.get_tracking_uri())

    my_pid = os.getpid()
    parent_pid = os.getppid()

    # Create the artifact file, one per process id.
    artifact_path = "/tmp/features-" + str(my_pid) + ".txt"
    with open(artifact_path, 'w') as artifact_file:
        artifact_file.write("rooms, zipcode, median_price, school_rating, transport")

    print("Running in PID: {}".format(my_pid))
    with mlflow.start_run():
        mlflow.log_param("PID-" + str(my_pid), my_pid)
        mlflow.log_param("PPID-" + str(parent_pid), parent_pid)
        mlflow.log_metric("metric", random.randint(1, 10))
        mlflow.log_artifact(artifact_path)

    experiment = mlflow.get_experiment_by_name(exp_name)
    if not experiment:
        return
    print("In pid: {}, experiment_id: {}".format(my_pid,
                                                 experiment.experiment_id))
Esempio n. 6
0
def train_model(X_train: np.ndarray, y_train: np.ndarray) -> LinearRegression:
    """Fit a linear regression and log it to MLflow.

        Args:
            X_train: Training data of independent features.
            y_train: Training data for price.

        Returns:
            The fitted model.

    """
    model = LinearRegression()
    model.fit(X_train, y_train)

    store_scheme = urlparse(mlflow.get_tracking_uri()).scheme

    # The model registry does not work with a file store, so the model is
    # only registered under a name for other tracking backends. See
    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    extra_kwargs = {}
    if store_scheme != "file":
        extra_kwargs["registered_model_name"] = "regressor"

    mlflow.sklearn.log_model(model, "model", **extra_kwargs)
    return model
Esempio n. 7
0
def _run_entry_point(command, work_dir, experiment_id, run_id):
    """
    Launch an entry point command in a subprocess and wrap it in a
    ``LocalSubmittedRun``.

    :param command: Entry point command to run
    :param work_dir: Working directory in which to run the command
    :param experiment_id: Passed to ``get_run_env_vars`` together with ``run_id``.
    :param run_id: MLflow run ID associated with the entry point execution.
    :return: ``LocalSubmittedRun`` built from ``run_id`` and the started process.
    """
    env = os.environ.copy()
    env.update(get_run_env_vars(run_id, experiment_id))
    env.update(get_databricks_env_vars(tracking_uri=mlflow.get_tracking_uri()))
    _logger.info("=== Running command '%s' in run with ID '%s' === ", command,
                 run_id)

    # ``os.name == "nt"`` means Windows, where the command goes through
    # ``cmd``; everywhere else it is handed to ``bash``.
    if os.name == "nt":
        shell_prefix = ["cmd", "/c"]
    else:
        shell_prefix = ["bash", "-c"]

    process = subprocess.Popen(shell_prefix + [command],
                               close_fds=True,
                               cwd=work_dir,
                               env=env)
    return LocalSubmittedRun(run_id, process)
def test_kedro_mlflow_config_setup_tracking_priority(kedro_project_with_mlflow_conf):
    """Test if the mlflow_tracking uri set is the one of mlflow.yml
    if it also exists in credentials.

    The credentials file written by this test is blanked again on exit,
    whether the assertions pass or not.
    """
    credentials_file = kedro_project_with_mlflow_conf / "conf/base/credentials.yml"
    credentials_file.write_text(
        yaml.dump(dict(my_mlflow_creds=dict(mlflow_tracking_uri="mlruns2")))
    )

    try:
        config = KedroMlflowConfig(
            server=dict(
                mlflow_tracking_uri="mlruns1",
                credentials="my_mlflow_creds",
            ),
        )

        bootstrap_project(kedro_project_with_mlflow_conf)
        with KedroSession.create(project_path=kedro_project_with_mlflow_conf) as session:
            context = session.load_context()
            config.setup(context)

        assert (
            mlflow.get_tracking_uri()
            == (kedro_project_with_mlflow_conf / "mlruns1").as_uri()
        )
    finally:
        # reset folder to avoid interference with other tests; done in
        # ``finally`` so a failing assertion cannot leave the credentials behind
        credentials_file.write_text("")
Esempio n. 9
0
def test_sparkml_estimator_model_log(tmpdir, spark_model_estimator,
                                     should_start_run, use_dfs_tmpdir):
    """Log a Spark estimator model, reload it and compare predictions.

    :param tmpdir: temporary directory holding the tracking store and,
        when ``use_dfs_tmpdir`` is falsy, the DFS temp directory.
    :param spark_model_estimator: provides ``model``, ``spark_df`` and the
        expected ``predictions``.
    :param should_start_run: when truthy a run is started explicitly before
        logging.
    :param use_dfs_tmpdir: when truthy ``dfs_tmpdir=None`` is passed on.
    """
    old_tracking_uri = mlflow.get_tracking_uri()
    dfs_tmpdir = None if use_dfs_tmpdir else tmpdir.join("test").strpath

    try:
        tracking_dir = os.path.abspath(str(tmpdir.join("mlruns")))
        mlflow.set_tracking_uri("file://%s" % tracking_dir)
        if should_start_run:
            mlflow.start_run()

        artifact_path = "model"
        sparkm.log_model(
            artifact_path=artifact_path,
            spark_model=spark_model_estimator.model,
            dfs_tmpdir=dfs_tmpdir,
        )
        run_id = mlflow.active_run().info.run_id
        model_uri = "runs:/{run_id}/{artifact_path}".format(
            run_id=run_id, artifact_path=artifact_path)

        reloaded_model = sparkm.load_model(model_uri=model_uri,
                                           dfs_tmpdir=dfs_tmpdir)
        scored_df = reloaded_model.transform(spark_model_estimator.spark_df)
        reloaded_preds = [
            row.prediction for row in scored_df.select("prediction").collect()
        ]
        assert spark_model_estimator.predictions == reloaded_preds
    finally:
        # Close the run and put the previous tracking URI back.
        mlflow.end_run()
        mlflow.set_tracking_uri(old_tracking_uri)
Esempio n. 10
0
def train_model(model, X_train, y_train, name, config):
    """Train a single model inside an MLflow run.

    The model is compiled, fitted, saved under ``model/<name>.h5`` and its
    training history is written to ``model/<name> loss.csv``.

    # Arguments
        model: Model, NN model to train.
        X_train: ndarray(number, lags), Input data for train.
        y_train: ndarray(number, ), result data for train.
        name: String, name of model.
        config: Dict, parameter for train; ``"batch"`` and ``"epochs"`` are read.
    """
    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    print("Current tracking uri: {}".format(mlflow.get_tracking_uri()))

    mlflow.set_experiment("traffic_flow-saes")
    with mlflow.start_run() as run:
        mlflow.set_tags({"usuario": "Anonymous"})
        mlflow.keras.autolog()

        model.compile(loss="mse", optimizer="rmsprop", metrics=['mape'])
        history = model.fit(X_train,
                            y_train,
                            batch_size=config["batch"],
                            epochs=config["epochs"],
                            validation_split=0.05)

        model.save('model/' + name + '.h5')
        history_df = pd.DataFrame.from_dict(history.history)
        history_df.to_csv('model/' + name + ' loss.csv',
                          encoding='utf-8',
                          index=False)
        mlflow.log_param("Run_id", run.info.run_id)
Esempio n. 11
0
def test_model_log(prophet_model):
    """Log a prophet model, reload it and check forecasts and the logged env.

    Runs once without and once with an explicitly started run.
    """
    old_uri = mlflow.get_tracking_uri()
    with TempDir(chdr=True, remove_on_exit=True) as tmp:
        for should_start_run in [False, True]:
            try:
                mlflow.set_tracking_uri("test")
                if should_start_run:
                    mlflow.start_run()

                artifact_path = "prophet"
                conda_env = os.path.join(tmp.path(), "conda_env.yaml")
                _mlflow_conda_env(conda_env, additional_pip_deps=["pystan", "prophet"])

                model_info = mlflow.prophet.log_model(
                    pr_model=prophet_model.model,
                    artifact_path=artifact_path,
                    conda_env=conda_env,
                )
                run_id = mlflow.active_run().info.run_id
                model_uri = f"runs:/{run_id}/{artifact_path}"
                assert model_info.model_uri == model_uri

                reloaded = mlflow.prophet.load_model(model_uri=model_uri)
                np.testing.assert_array_equal(
                    generate_forecast(prophet_model.model, FORECAST_HORIZON),
                    generate_forecast(reloaded, FORECAST_HORIZON),
                )

                # The logged model must carry a pyfunc flavor whose env file exists.
                model_path = _download_artifact_from_uri(artifact_uri=model_uri)
                model_config = Model.load(os.path.join(model_path, "MLmodel"))
                assert pyfunc.FLAVOR_NAME in model_config.flavors
                pyfunc_flavor = model_config.flavors[pyfunc.FLAVOR_NAME]
                assert pyfunc.ENV in pyfunc_flavor
                assert os.path.exists(os.path.join(model_path, pyfunc_flavor[pyfunc.ENV]))

            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_uri)
Esempio n. 12
0
def mlflow_tracking_in_task_example(check_time=datetime.datetime.now()):
    # type: ( datetime.datetime)-> str
    """Exercise MLFlow tracking: log params, metrics and artifacts in one run.

    ``check_time`` is not referenced in the body.
    NOTE(review): its default is evaluated once, when the function is defined.
    """
    logger.info("Running MLFlow tracking integration check!")
    logger.info("MLFlow tracking URI: {}".format(get_tracking_uri()))

    start_run()

    # params
    for param_name in ("param1", "param2"):
        log_param(param_name, randint(0, 100))

    # metrics: each one is logged twice, the second value shifted by one
    for metric_name in ("foo1", "foo2"):
        log_metric(metric_name, random())
        log_metric(metric_name, random() + 1)

    # artifacts
    if not os.path.exists("outputs"):
        os.makedirs("outputs")
    for file_name, content in (("outputs/test1.txt", "hello"),
                               ("outputs/test2.txt", "world!")):
        with open(file_name, "w") as out_file:
            out_file.write(content)
    log_artifacts("outputs")

    # Get run metadata & data from the tracking server
    client = MlflowClient()
    run_id = active_run().info.run_id
    run = client.get_run(run_id)
    logger.info("Metadata & data for run with UUID %s: %s" % (run_id, run))

    end_run()

    logger.info("MLFlow tracking integration check completed!")
Esempio n. 13
0
def test_model_log(h2o_iris_model):
    """Log an H2O model, reload it and require identical predictions.

    :param h2o_iris_model: provides ``model`` and ``inference_data``.
    """
    h2o_model = h2o_iris_model.model
    old_uri = mlflow.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        with TempDir(chdr=True, remove_on_exit=True):
            try:
                artifact_path = "gbm_model"
                mlflow.set_tracking_uri("test")
                if should_start_run:
                    mlflow.start_run()
                mlflow.h2o.log_model(h2o_model=h2o_model,
                                     artifact_path=artifact_path)
                model_uri = "runs:/{run_id}/{artifact_path}".format(
                    run_id=mlflow.active_run().info.run_id,
                    artifact_path=artifact_path)

                # Load model
                h2o_model_loaded = mlflow.h2o.load_model(model_uri=model_uri)
                loaded_preds = h2o_model_loaded.predict(
                    h2o_iris_model.inference_data).as_data_frame()
                original_preds = h2o_model.predict(
                    h2o_iris_model.inference_data).as_data_frame()
                # ``all(frame)`` would iterate the column labels rather than the
                # cell values, so reduce the element-wise comparison explicitly.
                # Assumes ``as_data_frame()`` yields pandas DataFrames -- TODO confirm.
                assert (loaded_preds == original_preds).all().all()
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_uri)
Esempio n. 14
0
def test_kedro_mlflow_config_setup_tracking_priority(kedro_project_with_mlflow_conf):
    """The tracking URI passed to the config wins over the one in credentials.

    Args:
        kedro_project_with_mlflow_conf: path-like root of the kedro project
            used by the test.
    """
    project_root = kedro_project_with_mlflow_conf

    credentials = {"my_mlflow_creds": {"mlflow_tracking_uri": "mlruns2"}}
    (project_root / "conf/base/credentials.yml").write_text(yaml.dump(credentials))

    config = KedroMlflowConfig(
        project_path=project_root,
        mlflow_tracking_uri="mlruns1",
        credentials="my_mlflow_creds",
    )

    project_metadata = _get_project_metadata(project_root)
    _add_src_to_path(project_metadata.source_dir, project_root)
    configure_project(project_metadata.package_name)
    with KedroSession.create("fake_project", project_path=project_root):
        config.setup()

    expected_uri = (project_root / "mlruns1").as_uri()
    assert mlflow.get_tracking_uri() == expected_uri
Esempio n. 15
0
def test_docker_project_tracking_uri_propagation(ProfileConfigProvider, tmpdir,
                                                 tracking_uri,
                                                 expected_command_segment,
                                                 docker_example_base_image):  # pylint: disable=unused-argument
    """Run the docker test project with a given tracking URI in effect.

    ``_get_store`` is patched to return a ``FileStore`` rooted in ``tmpdir``.
    When ``tracking_uri`` is ``None`` that same local directory is used as the
    URI. The previous tracking URI is restored afterwards.
    """
    provider = mock.MagicMock()
    provider.get_config.return_value = DatabricksConfig("host",
                                                        "user",
                                                        "pass",
                                                        None,
                                                        insecure=True)
    ProfileConfigProvider.return_value = provider

    # Create and mock local tracking directory
    local_tracking_dir = os.path.join(tmpdir.strpath, "mlruns")
    if tracking_uri is None:
        tracking_uri = local_tracking_dir

    previous_uri = mlflow.get_tracking_uri()
    try:
        mlflow.set_tracking_uri(tracking_uri)
        store_patch = mock.patch(
            "mlflow.tracking._tracking_service.utils._get_store")
        with store_patch as get_store_mock:
            get_store_mock.return_value = file_store.FileStore(
                local_tracking_dir)
            mlflow.projects.run(
                TEST_DOCKER_PROJECT_DIR,
                experiment_id=file_store.FileStore.DEFAULT_EXPERIMENT_ID)
    finally:
        mlflow.set_tracking_uri(previous_uri)
Esempio n. 16
0
def set_tracking_uri():
    """Point MLflow at the local tracking server and print the resulting URI."""
    # This does not affect the currently active run (if one exists),
    # but takes effect for successive runs.
    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    # Read the value back as a check.
    print(f"\nCurrent tracking uri: {mlflow.get_tracking_uri()}")
Esempio n. 17
0
def train_workflow(num_topics):
    """Print the tracking URI, create the experiment and train LDA in a run.

    :param num_topics: forwarded to ``lda.train``.
    """
    current_uri = mlflow.get_tracking_uri()
    print(current_uri)

    mlflow.create_experiment(name="topic_modeling_news_topics_5")

    with mlflow.start_run():
        lda.train(num_topics=num_topics)
Esempio n. 18
0
def test_sparkml_estimator_model_log(tmpdir, spark_model_estimator):
    """Log and reload a Spark estimator model for every run/DFS-dir combination.

    Each iteration logs under a fresh ``model<N>`` artifact path, compares the
    reloaded model's predictions with the expected ones, then removes the DFS
    temp directory and the tracking directory.
    """
    old_tracking_uri = mlflow.get_tracking_uri()
    model_index = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.join("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()

                artifact_path = "model%d" % model_index
                model_index += 1
                sparkm.log_model(
                    artifact_path=artifact_path,
                    spark_model=spark_model_estimator.model,
                    dfs_tmpdir=dfs_tmp_dir,
                )
                run_id = mlflow.active_run().info.run_id
                model_uri = "runs:/{run_id}/{artifact_path}".format(
                    run_id=run_id, artifact_path=artifact_path
                )

                # test reloaded model
                reloaded_model = sparkm.load_model(model_uri=model_uri, dfs_tmpdir=dfs_tmp_dir)
                scored_df = reloaded_model.transform(spark_model_estimator.spark_df)
                reloaded_preds = [
                    row.prediction for row in scored_df.select("prediction").collect()
                ]
                assert spark_model_estimator.predictions == reloaded_preds
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                # Clean up whichever DFS temp dir was in use, then the store.
                dfs_dir_used = dfs_tmp_dir or sparkm.DFS_TMP
                shutil.rmtree(dfs_dir_used)
                shutil.rmtree(tracking_dir)
 def test_model_log(self):
     """Log the linear model, reload it and check env file and predictions."""
     old_uri = mlflow.get_tracking_uri()
     # should_start_run tests whether or not calling log_model() automatically starts a run.
     for should_start_run in [False, True]:
         with TempDir(chdr=True, remove_on_exit=True) as tmp:
             try:
                 mlflow.set_tracking_uri("test")
                 if should_start_run:
                     mlflow.start_run()

                 artifact_path = "linear"
                 conda_env = os.path.join(tmp.path(), "conda_env.yaml")
                 _mlflow_conda_env(conda_env, additional_pip_deps=["sklearn"])
                 sklearn.log_model(sk_model=self._linear_lr,
                                   artifact_path=artifact_path,
                                   conda_env=conda_env)

                 run_uuid = mlflow.active_run().info.run_uuid
                 reloaded = sklearn.load_model(artifact_path, run_id=run_uuid)
                 model_path = _get_model_log_dir(artifact_path, run_uuid)

                 # The logged model must carry a pyfunc flavor whose env file exists.
                 model_config = Model.load(os.path.join(model_path, "MLmodel"))
                 assert pyfunc.FLAVOR_NAME in model_config.flavors
                 pyfunc_flavor = model_config.flavors[pyfunc.FLAVOR_NAME]
                 assert pyfunc.ENV in pyfunc_flavor
                 assert os.path.exists(os.path.join(model_path, pyfunc_flavor[pyfunc.ENV]))

                 np.testing.assert_array_equal(self._linear_lr_predict,
                                               reloaded.predict(self._X))
             finally:
                 mlflow.end_run()
                 mlflow.set_tracking_uri(old_uri)
Esempio n. 20
0
 def _ensure_tracking_uri(self, meta, tracking_uri=None):
     """Resolve the tracking URI, store it on ``meta`` and activate it.

     The first truthy value wins, in this order: the ``tracking_uri``
     argument, ``mlflow.get_tracking_uri()``, ``meta.kind_meta['tracking_uri']``,
     the ``MLFLOW_TRACKING_URI`` environment variable. The chosen URI is
     probed with a one-result ``list_registered_models`` call before being set.
     """
     # Walk the fallbacks only as far as needed.
     if not tracking_uri:
         tracking_uri = mlflow.get_tracking_uri()
     if not tracking_uri:
         tracking_uri = meta.kind_meta.get('tracking_uri')
     if not tracking_uri:
         tracking_uri = os.environ.get('MLFLOW_TRACKING_URI')

     meta.kind_meta['tracking_uri'] = tracking_uri
     assert tracking_uri, "pass tracking_uri= or set kind_meta.tracking_uri or set env MLFLOW_TRACKING_URI"

     client = mlflow.tracking.MlflowClient(tracking_uri)
     client.list_registered_models(max_results=1)
     mlflow.set_tracking_uri(tracking_uri)
Esempio n. 21
0
def tune_fn():
    """Run the ``mlflow_gbdt`` tuning job with Optuna as the search algorithm.

    The experiment name comes from the enclosing scope's ``experiment_name``.
    An ``AxSearch`` instance is also constructed but not passed to ``tune.run``.
    """
    mlflow.set_experiment(experiment_name=experiment_name)

    optuna_search = OptunaSearch(metric="auroc", mode="max")

    ax_search = AxSearch(metric="auroc", mode="max")

    search_space = {
        "num_leaves": tune.randint(5, 95),
        "learning_rate": tune.loguniform(1e-4, 1.0),
        "n_estimators": tune.randint(100, 100000),
        "subsample": tune.loguniform(0.01, 1.0),
        "subsample_freq": tune.randint(1, 5),
        "objective": "binary",
        "reg_alpha": tune.loguniform(1e-4, 1.0),
        "reg_lambda": tune.loguniform(1e-4, 1.0),
        "tree_learner": "feature",
        "feature_sel": 0,
        # MLflow settings travel inside the trial config.
        "mlflow": {
            "experiment_name": experiment_name,
            "tracking_uri": mlflow.get_tracking_uri()
        }
    }

    tune.run(objective,
             name="mlflow_gbdt",
             num_samples=65,
             config=search_space,
             search_alg=optuna_search)
Esempio n. 22
0
def test_tune_exp_default_trainable(
    dataset, pipeline_config, trainer_config, monkeypatch
):
    """Run a one-sample ``TuneExperiment`` and check its naming and tracking.

    Verifies the default name and run identifier, that exactly one trial ran,
    and that an MLflow experiment with the same name can be looked up.
    """
    # avoid logging to wandb
    monkeypatch.setenv("WANDB_MODE", "dryrun")

    # Turn two settings into search spaces.
    pipeline_config["features"]["word"]["embedding_dim"] = tune.choice([2, 4])
    trainer_config["optimizer"]["lr"] = tune.loguniform(0.001, 0.01)

    experiment = TuneExperiment(
        pipeline_config=pipeline_config,
        trainer_config=trainer_config,
        train_dataset=dataset,
        valid_dataset=dataset,
        num_samples=1,
    )

    assert experiment._name.startswith("HPO on")
    assert experiment.name == experiment._name
    assert experiment._run_identifier == "_default_trainable"

    analysis = tune.run(experiment)
    assert len(analysis.trials) == 1

    current_uri = mlflow.get_tracking_uri()
    mlflow.set_tracking_uri(current_uri)
    assert mlflow.get_experiment_by_name(experiment._name)
Esempio n. 23
0
def get_store(uri=None):
    """Return a ``FileStore`` for ``uri``.

    :param uri: store location; when ``None`` the current MLflow tracking URI
        is used instead.
    """
    from mlflow import get_tracking_uri
    from mlflow.store.file_store import FileStore

    store_uri = get_tracking_uri() if uri is None else uri
    return FileStore(store_uri)
Esempio n. 24
0
    def setUp(self):
        """Point MLflow at a fresh temp directory and create one experiment."""
        tmp_root = tempfile.mkdtemp()
        self.dirpath = tmp_root

        import mlflow
        mlflow.set_tracking_uri(tmp_root)
        mlflow.create_experiment(name="existing_experiment")

        self.mlflow_util = MLflowLoggerUtil()
        # Store the URI as MLflow reports it back.
        self.tracking_uri = mlflow.get_tracking_uri()
Esempio n. 25
0
def test_model_log(tmpdir):
    """Log a Spark ML pipeline and check each reload path against its predictions.

    A logistic-regression pipeline is fitted on the iris data. For every
    combination of ``should_start_run`` and ``dfs_tmp_dir`` the model is
    logged, then scored again through pyfunc, ``sparkm.load_model`` and
    ``score_model_as_udf``; each result must equal the original predictions.
    """
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])

    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)  # to make spark_udf work
    pandas_df['label'] = pd.Series(iris.target)

    session_builder = pyspark.sql.SparkSession.builder
    session_builder = session_builder.config(key="spark_session.python.worker.reuse", value=True)
    session_builder = session_builder.master("local-cluster[2, 1, 1024]")
    spark_session = session_builder.getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)

    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    # Fit the model
    model = Pipeline(stages=[assembler, lr]).fit(spark_df)

    # Reference predictions every reload path is compared against.
    reference_df = model.transform(spark_df)
    expected_preds = [row.prediction for row in reference_df.select("prediction").collect()]

    old_tracking_uri = mlflow.get_tracking_uri()
    model_index = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()

                artifact_path = "model%d" % model_index
                model_index += 1
                sparkm.log_model(artifact_path=artifact_path, spark_model=model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = active_run().info.run_uuid

                # test pyfunc
                pyfunc_model = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                pyfunc_preds = pyfunc_model.predict(pandas_df)
                assert expected_preds == pyfunc_preds

                # test load model
                reloaded_model = sparkm.load_model(artifact_path, run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                reloaded_df = reloaded_model.transform(spark_df)
                reloaded_preds = [
                    row.prediction for row in reloaded_df.select("prediction").collect()
                ]
                assert expected_preds == reloaded_preds

                # test spark_udf
                udf_preds = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert expected_preds == udf_preds

                # We expect not to delete the DFS tempdir.
                dfs_dir_used = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(dfs_dir_used)
                assert os.listdir(dfs_dir_used)
                shutil.rmtree(dfs_dir_used)
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
Esempio n. 26
0
def run_test():
    """Resume one fixed run for every (l1, alpha) pair and log both as params.

    The artifact and tracking URIs are queried inside each run; their values
    are not used.
    """
    for l1 in [0.75, 1]:
        for alpha in [0, 0.5]:
            with mlflow.start_run(run_id='91878d6666994fa8b7205f4a83171e8a',
                                  experiment_id=2,
                                  run_name='ipython'):
                # metrics = {
                #     'MAE': [rand()],
                #     'R2': [rand()],
                #     'RMSE': [rand()],
                # }
                mlflow.log_params({'l1': str(l1), 'alpha': str(alpha)})
                mlflow.get_artifact_uri()
                mlflow.get_tracking_uri()
Esempio n. 27
0
def create_common_manifest(duration):
    """Build the manifest dict with a single ``"info"`` section.

    :param duration: stored unchanged under ``info["duration"]``.
    :return: dict holding the MLflow version, the tracking URI, the export
        time from ``get_now_nice()`` and ``duration``.
    """
    info = {}
    info["mlflow_version"] = mlflow.__version__
    info["mlflow_tracking_uri"] = mlflow.get_tracking_uri()
    info["export_time"] = get_now_nice()
    info["duration"] = duration
    return {"info": info}
Esempio n. 28
0
def get_db_schema():
    """Return the ``CREATE TABLE`` statements of the tracking database as text.

    The MLflow tracking URI is used as the SQLAlchemy engine URL. Tables are
    emitted in ``sorted_tables`` order, with trailing whitespace stripped
    from every line.
    """
    engine = sqlalchemy.create_engine(mlflow.get_tracking_uri())
    reflected = MetaData(bind=engine)
    reflected.reflect()
    # Write out table schema as described in
    # https://docs.sqlalchemy.org/en/13/faq/metadata_schema.html#how-can-i-get-the-create-table-drop-table-output-as-a-string
    ddl_lines = []
    for table in reflected.sorted_tables:
        statement = str(CreateTable(table))
        ddl_lines.extend(line.rstrip() for line in statement.splitlines())
    return "\n".join(ddl_lines)
Esempio n. 29
0
    def test_log_saved_model(self):
        """Train a DNN regressor on two iris features and check logged-model predictions.

        The predictions returned by ``self.helper`` must equal the estimator's
        own predictions, both with and without an explicitly started run.
        """
        # This tests model logging capabilities on the sklearn.iris dataset.
        iris = datasets.load_iris()
        X = iris.data[:, :2]  # we only take the first two features.
        y = iris.target

        trainingFeatures = {}
        for i in range(0, 2):
            # TensorFlow is fickle about feature names, so we remove offending characters
            cleaned_name = iris.feature_names[i]
            for bad_char in (" ", "(", ")"):
                cleaned_name = cleaned_name.replace(bad_char, "")
            iris.feature_names[i] = cleaned_name
            trainingFeatures[cleaned_name] = iris.data[:, i:i+1]

        feature_names = iris.feature_names[:2]
        # Creating TensorFlow-specific numeric columns for input.
        tf_feat_cols = [
            tf.feature_column.numeric_column(col) for col in feature_names
        ]
        # Creating input training function.
        input_train = tf.estimator.inputs.numpy_input_fn(trainingFeatures,
                                                         y,
                                                         shuffle=False,
                                                         batch_size=1)
        # Creating Deep Neural Network Regressor.
        estimator = tf.estimator.DNNRegressor(feature_columns=tf_feat_cols,
                                              hidden_units=[1])
        # Training and creating expected predictions on training dataset.
        estimator.train(input_train, steps=10)
        # Saving the estimator's prediction on the training data; assume the DNNRegressor
        # produces a single output column named 'predictions'
        pred_col = "predictions"
        estimator_preds = [
            sample[pred_col] for sample in estimator.predict(input_train)
        ]
        estimator_preds_df = pd.DataFrame({pred_col: estimator_preds})

        old_tracking_uri = mlflow.get_tracking_uri()
        # should_start_run tests whether or not calling log_model() automatically starts a run.
        for should_start_run in [False, True]:
            with TempDir(chdr=True, remove_on_exit=True) as tmp:
                try:
                    # Creating dict of features names (str) to placeholders (tensors)
                    feature_spec = {
                        name: tf.placeholder("float", name=name, shape=[150])
                        for name in feature_names
                    }
                    mlflow.set_tracking_uri("test")
                    if should_start_run:
                        mlflow.start_run()
                    input_df = pandas.DataFrame(data=X, columns=feature_names)
                    pyfunc_preds_df = self.helper(feature_spec, tmp, estimator,
                                                  input_df)

                    # Asserting that the loaded model predictions are as expected.
                    assert estimator_preds_df.equals(pyfunc_preds_df)
                finally:
                    # Restoring the old logging location.
                    mlflow.end_run()
                    mlflow.set_tracking_uri(old_tracking_uri)
Esempio n. 30
0
 def config(self) -> dict:
     """Return the config dictionary consumed by `TuneExperiment.trainable`.

     It bundles the pipeline and trainer configs, the dataset and vocab
     paths, the experiment name and the current MLflow tracking URI.
     """
     return dict(
         pipeline_config=self._pipeline_config,
         trainer_config=self._trainer_config,
         train_dataset_path=self._train_dataset_path,
         valid_dataset_path=self._valid_dataset_path,
         mlflow_tracking_uri=mlflow.get_tracking_uri(),
         vocab_path=self._vocab_path,
         name=self._name,
     )