def test_pipeline(dataset_text):
    """Autologging on plain and nested pipelines logs the estimator-info
    artifact (with hierarchy), params, tags, and the fitted model.

    NOTE(review): a second ``test_pipeline`` defined later in this module
    shadows this definition at import time, so pytest never collects this
    one — confirm whether both versions are meant to exist.
    """
    mlflow.pyspark.ml.autolog()

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=2, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    inner_pipeline = Pipeline(stages=[hashingTF, lr])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    for fit_estimator in (pipeline, nested_pipeline):
        with mlflow.start_run() as active_run:
            fitted = fit_estimator.fit(dataset_text)
            logged_info = load_json_artifact("estimator_info.json")
            meta = _gen_estimator_metadata(fit_estimator)
            # The hierarchy stored in the artifact must mirror the one
            # computed from the estimator itself.
            assert meta.hierarchy == logged_info["hierarchy"]

        name_map = meta.uid_to_indexed_name_map
        run_id = active_run.info.run_id
        data = get_run_data(run_id)
        expected_params = truncate_param_dict(
            stringify_dict_values(
                _get_instance_param_map(fit_estimator, name_map)))
        assert data.params == expected_params
        assert data.tags == get_expected_class_tags(fit_estimator)
        assert MODEL_DIR in data.artifacts
        assert load_model_by_run_id(run_id).uid == fitted.uid
        assert data.artifacts == ["estimator_info.json", "model"]
def test_pipeline(dataset_text):
    """Autologging on plain and nested pipelines: checks the stage-hierarchy
    helper, then the logged params, tags, and artifacts per run."""
    mlflow.pyspark.ml.autolog()

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=2, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    inner_pipeline = Pipeline(stages=[hashingTF, lr])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    # The hierarchy helper maps each pipeline uid to its stage uids,
    # recursing into nested pipelines as {inner_uid: [stage_uids]}.
    expected_flat = {pipeline.uid: [tokenizer.uid, hashingTF.uid, lr.uid]}
    assert _get_pipeline_stage_hierarchy(pipeline) == expected_flat
    expected_nested = {
        nested_pipeline.uid: [
            tokenizer.uid,
            {inner_pipeline.uid: [hashingTF.uid, lr.uid]},
        ]
    }
    assert _get_pipeline_stage_hierarchy(nested_pipeline) == expected_nested

    for fit_estimator in (pipeline, nested_pipeline):
        with mlflow.start_run() as active_run:
            fitted = fit_estimator.fit(dataset_text)

        run_id = active_run.info.run_id
        data = get_run_data(run_id)
        assert data.params == truncate_param_dict(
            stringify_dict_values(_get_instance_param_map(fit_estimator)))
        assert data.tags == get_expected_class_tags(fit_estimator)
        assert MODEL_DIR in data.artifacts
        assert load_model_by_run_id(run_id).uid == fitted.uid
        assert data.artifacts == ["model", "pipeline_hierarchy.json"]
def test_param_map_captures_wrapped_params(dataset_binomial):
    """Params of an estimator wrapped inside a meta-estimator (OneVsRest)
    are flattened into ``<wrapped_uid>.<param>`` keys of the param map."""
    classifier = LogisticRegression(maxIter=3, standardization=False)
    meta = OneVsRest(classifier=classifier, labelCol="abcd")

    captured = _get_instance_param_map(meta)
    assert captured["labelCol"] == "abcd"
    # The wrapped estimator itself is recorded by its uid...
    assert captured["classifier"] == classifier.uid
    # ...and its own params are prefixed with that uid.
    assert captured[f"{classifier.uid}.maxIter"] == 3
    assert not captured[f"{classifier.uid}.standardization"]
    assert captured[f"{classifier.uid}.tol"] == classifier.getOrDefault(
        classifier.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as active_run:
        meta.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
    data = get_run_data(active_run.info.run_id)
    assert data.params == truncate_param_dict(
        stringify_dict_values(_get_instance_param_map(meta)))
def test_fit_with_params(dataset_binomial):
    """Extra params passed to ``fit`` are reflected in the logged param map."""
    mlflow.pyspark.ml.autolog()
    linreg = LinearRegression()
    fit_params = {linreg.maxIter: 3, linreg.standardization: False}
    with mlflow.start_run() as active_run:
        fitted = linreg.fit(dataset_binomial, params=fit_params)
    run_id = active_run.info.run_id
    data = get_run_data(run_id)
    # Logged params must match the estimator *with* the fit-time overrides
    # applied (hence the copy with fit_params).
    assert data.params == truncate_param_dict(
        stringify_dict_values(
            _get_instance_param_map(linreg.copy(fit_params))))
    assert data.tags == get_expected_class_tags(linreg)
    assert MODEL_DIR in data.artifacts
    assert load_model_by_run_id(run_id).stages[0].uid == fitted.uid
def test_get_instance_param_map(spark_session):  # pylint: disable=unused-argument
    """_get_instance_param_map flattens plain estimators, meta-estimators,
    and (nested) pipelines into a single param dict."""
    classifier = LogisticRegression(maxIter=3, standardization=False)
    classifier_params = _get_instance_param_map(classifier)
    assert classifier_params["maxIter"] == 3
    assert not classifier_params["standardization"]
    assert classifier_params["family"] == classifier.getOrDefault(
        classifier.family)

    meta = OneVsRest(classifier=classifier, labelCol="abcd")
    meta_params = _get_instance_param_map(meta)
    assert meta_params["classifier"] == classifier.uid
    assert meta_params["labelCol"] == "abcd"
    assert meta_params[f"{classifier.uid}.maxIter"] == 3
    assert meta_params[f"{classifier.uid}.family"] == classifier.getOrDefault(
        classifier.family)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    flat_pipeline = Pipeline(stages=[tokenizer, hashingTF, meta])
    inner = Pipeline(stages=[hashingTF, meta])
    nested = Pipeline(stages=[tokenizer, inner])

    flat_params = _get_instance_param_map(flat_pipeline)
    nested_params = _get_instance_param_map(nested)

    # Pipeline stages are listed by uid; an inner pipeline shows up as a
    # {inner_uid: [stage_uids]} entry inside the stage list.
    assert flat_params["stages"] == [tokenizer.uid, hashingTF.uid, meta.uid]
    assert nested_params["stages"] == [
        tokenizer.uid,
        {inner.uid: [hashingTF.uid, meta.uid]},
    ]

    # Stage params are flattened identically for both pipeline shapes.
    for flattened in (flat_params, nested_params):
        assert flattened[f"{tokenizer.uid}.inputCol"] == "text"
        assert flattened[f"{tokenizer.uid}.outputCol"] == "words"
        assert flattened[f"{hashingTF.uid}.outputCol"] == "features"
        assert flattened[f"{meta.uid}.classifier"] == classifier.uid
        assert flattened[f"{classifier.uid}.maxIter"] == 3
def test_meta_estimator_fit(dataset_binomial):
    """Fitting a meta-estimator logs a single run with params, tags and the
    model artifact, and spawns no nested (child) runs."""
    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as active_run:
        base = LinearSVC()
        meta = OneVsRest(classifier=base)
        fitted = meta.fit(dataset_binomial)

    run_id = active_run.info.run_id
    data = get_run_data(run_id)
    assert data.params == truncate_param_dict(
        stringify_dict_values(_get_instance_param_map(meta)))
    assert data.tags == get_expected_class_tags(meta)
    assert MODEL_DIR in data.artifacts
    assert load_model_by_run_id(run_id).stages[0].uid == fitted.uid

    # assert no nested run spawned
    query = f"tags.{MLFLOW_PARENT_RUN_ID} = '{active_run.info.run_id}'"
    assert len(mlflow.search_runs([active_run.info.experiment_id])) == 1
    assert len(mlflow.search_runs([active_run.info.experiment_id],
                                  query)) == 0
def test_basic_estimator(dataset_binomial):
    """Basic estimators log params and tags; for one of the two estimator
    types the model artifact is expected to be absent."""
    mlflow.pyspark.ml.autolog()

    estimators = [
        LinearRegression(),
        MultilayerPerceptronClassifier(layers=[2, 2, 2],
                                       seed=123,
                                       blockSize=1),
    ]
    for fit_estimator in estimators:
        with mlflow.start_run() as active_run:
            fitted = fit_estimator.fit(dataset_binomial)
        run_id = active_run.info.run_id
        data = get_run_data(run_id)
        assert data.params == truncate_param_dict(
            stringify_dict_values(_get_instance_param_map(fit_estimator)))
        assert data.tags == get_expected_class_tags(fit_estimator)
        if isinstance(fit_estimator, MultilayerPerceptronClassifier):
            # No model artifact is logged for the MLP classifier
            # (presumably unsupported for model logging — TODO confirm).
            assert MODEL_DIR not in data.artifacts
        else:
            assert MODEL_DIR in data.artifacts
            assert load_model_by_run_id(run_id).stages[0].uid == fitted.uid
def test_param_search_estimator(  # pylint: disable=unused-argument
        metric_name, param_search_estimator, spark_session,
        dataset_regression):
    """Autologging for param-search estimators (e.g. CrossValidator /
    TrainValidationSplit).

    Verifies that the parent run logs the estimator-info artifact, the
    best parameters, the search-results table, both the search model and
    the best model, and that each candidate param map produces exactly one
    FINISHED child run whose params and metric match the corresponding
    search-results row.

    Fix: renamed the misspelled local ``param_search_estiamtor_info`` to
    ``param_search_estimator_info``.
    """
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    lrParamMaps = [
        {
            lr.maxIter: 1,
            lr.standardization: False
        },
        {
            lr.maxIter: 200,
            lr.standardization: True
        },
        {
            lr.maxIter: 2,
            lr.standardization: False
        },
    ]
    best_params = {
        "LinearRegression.maxIter": 200,
        "LinearRegression.standardization": True
    }
    eva = RegressionEvaluator(metricName=metric_name)
    estimator = param_search_estimator(estimator=lr,
                                       estimatorParamMaps=lrParamMaps,
                                       evaluator=eva)
    with mlflow.start_run() as run:
        model = estimator.fit(dataset_regression)
        estimator_info = load_json_artifact("estimator_info.json")
        metadata = _gen_estimator_metadata(estimator)
        assert metadata.hierarchy == estimator_info["hierarchy"]

        # The estimator-info artifact carries both the tuned estimator's
        # own params and the candidate param maps of the search.
        param_search_estimator_info = estimator_info[
            metadata.uid_to_indexed_name_map[estimator.uid]]
        assert param_search_estimator_info[
            "tuned_estimator_parameter_map"] == _get_instance_param_map_recursively(
                lr, 1, metadata.uid_to_indexed_name_map)
        assert param_search_estimator_info[
            "tuning_parameter_map_list"] == _get_tuning_param_maps(
                estimator, metadata.uid_to_indexed_name_map)

        assert best_params == load_json_artifact("best_parameters.json")

        search_results = load_json_csv("search_results.csv")

    uid_to_indexed_name_map = metadata.uid_to_indexed_name_map
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    # Parent-run params: the estimator's param map plus "best_"-prefixed
    # entries for the winning candidate.
    assert run_data.params == truncate_param_dict(
        stringify_dict_values({
            **_get_instance_param_map(estimator, uid_to_indexed_name_map),
            **{f"best_{k}": v
               for k, v in best_params.items()},
        }))
    assert run_data.tags == get_expected_class_tags(estimator)
    assert MODEL_DIR in run_data.artifacts
    loaded_model = load_model_by_run_id(run_id)
    assert loaded_model.stages[0].uid == model.uid
    loaded_best_model = load_model_by_run_id(run_id, "best_model")
    assert loaded_best_model.stages[0].uid == model.bestModel.uid
    assert run_data.artifacts == [
        "best_model",
        "best_parameters.json",
        "estimator_info.json",
        "model",
        "search_results.csv",
    ]

    # One child run per row of the search-results table.
    client = mlflow.tracking.MlflowClient()
    child_runs = client.search_runs(
        run.info.experiment_id,
        "tags.`mlflow.parentRunId` = '{}'".format(run_id))
    assert len(child_runs) == len(search_results)

    for row_index, row in search_results.iterrows():
        # Each candidate's params appear both in the JSON "params" column
        # and as individual "param.<name>" columns.
        row_params = json.loads(row.get("params", "{}"))
        for param_name, param_value in row_params.items():
            assert param_value == row.get(f"param.{param_name}")

        # Locate the unique child run matching this candidate's params.
        params_search_clause = " and ".join([
            "params.`{}` = '{}'".format(key.split(".")[1], value)
            for key, value in row_params.items()
        ])
        search_filter = "tags.`mlflow.parentRunId` = '{}' and {}".format(
            run_id, params_search_clause)
        child_runs = client.search_runs(run.info.experiment_id, search_filter)
        assert len(child_runs) == 1
        child_run = child_runs[0]
        assert child_run.info.status == RunStatus.to_string(RunStatus.FINISHED)
        run_data = get_run_data(child_run.info.run_id)
        child_estimator = estimator.getEstimator().copy(
            estimator.getEstimatorParamMaps()[row_index])
        assert run_data.tags == get_expected_class_tags(child_estimator)
        assert run_data.params == truncate_param_dict(
            stringify_dict_values({
                **_get_instance_param_map(child_estimator, uid_to_indexed_name_map)
            }))
        assert (child_run.data.tags.get(MLFLOW_AUTOLOGGING) ==
                mlflow.pyspark.ml.AUTOLOGGING_INTEGRATION_NAME)

        # CrossValidator averages metrics across folds (hence the "avg_"
        # prefix); TrainValidationSplit reports them directly.
        metric_name = estimator.getEvaluator().getMetricName()
        if isinstance(estimator, CrossValidator):
            avg_metric_value = model.avgMetrics[row_index]
            avg_metric_name = f"avg_{metric_name}"
        else:
            avg_metric_value = model.validationMetrics[row_index]
            avg_metric_name = metric_name

        assert math.isclose(avg_metric_value,
                            run_data.metrics[avg_metric_name],
                            rel_tol=1e-6)
        assert math.isclose(avg_metric_value,
                            float(row.get(avg_metric_name)),
                            rel_tol=1e-6)

        # stdMetrics only exists for CrossValidator on pyspark >= 3.3.
        if isinstance(estimator, CrossValidator) and Version(
                pyspark.__version__) >= Version("3.3"):
            std_metric_name = f"std_{metric_name}"
            std_metric_value = model.stdMetrics[row_index]
            assert math.isclose(std_metric_value,
                                run_data.metrics[std_metric_name],
                                rel_tol=1e-6)
            assert math.isclose(std_metric_value,
                                float(row.get(std_metric_name)),
                                rel_tol=1e-6)
def get_params_to_log(estimator):
    """Return the flattened param map that autologging would record for
    *estimator*, keyed via its uid-to-indexed-name mapping."""
    name_map = _gen_estimator_metadata(estimator).uid_to_indexed_name_map
    return _get_instance_param_map(estimator, name_map)