def test_get_params_to_log(spark_session):  # pylint: disable=unused-argument
    """Verify ``get_params_to_log`` flattens estimator params, prefixing nested
    estimator params with the nested stage's class name (e.g. "LogisticRegression").

    One condition per ``assert`` so a pytest failure pinpoints the exact clause.
    """
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = get_params_to_log(lor)
    # Both explicitly-set params and defaults are captured.
    assert lor_params["maxIter"] == 3
    assert not lor_params["standardization"]
    assert lor_params["family"] == lor.getOrDefault(lor.family)

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = get_params_to_log(ova)
    # The wrapped classifier is logged by class name; its params are
    # flattened with a "LogisticRegression." prefix.
    assert ova_params["classifier"] == "LogisticRegression"
    assert ova_params["labelCol"] == "abcd"
    assert ova_params["LogisticRegression.maxIter"] == 3
    assert ova_params["LogisticRegression.family"] == lor.getOrDefault(lor.family)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])
    pipeline_params = get_params_to_log(pipeline)
    nested_pipeline_params = get_params_to_log(nested_pipeline)
    # Pipeline stages are listed by class name; an inner pipeline gets an
    # index suffix ("Pipeline_2") to distinguish it from the outer one.
    assert pipeline_params["stages"] == ["Tokenizer", "HashingTF", "OneVsRest"]
    assert nested_pipeline_params["stages"] == ["Tokenizer", "Pipeline_2"]
    assert nested_pipeline_params["Pipeline_2.stages"] == ["HashingTF", "OneVsRest"]
    assert nested_pipeline_params["OneVsRest.classifier"] == "LogisticRegression"
    # Stage params flatten identically whether the pipeline is flat or nested.
    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert params_to_test["Tokenizer.inputCol"] == "text"
        assert params_to_test["Tokenizer.outputCol"] == "words"
        assert params_to_test["HashingTF.outputCol"] == "features"
        assert params_to_test["OneVsRest.classifier"] == "LogisticRegression"
        assert params_to_test["LogisticRegression.maxIter"] == 3
def test_get_instance_param_map(spark_session):  # pylint: disable=unused-argument
    """Verify ``_get_instance_param_map`` flattens estimator params, keying
    nested estimator params by the nested instance's ``uid`` (unlike
    ``get_params_to_log``, which keys by class name).

    One condition per ``assert`` so a pytest failure pinpoints the exact clause.
    """
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = _get_instance_param_map(lor)
    # Both explicitly-set params and defaults are captured.
    assert lor_params["maxIter"] == 3
    assert not lor_params["standardization"]
    assert lor_params["family"] == lor.getOrDefault(lor.family)

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = _get_instance_param_map(ova)
    # The wrapped classifier is referenced by uid, and its params are
    # flattened under a "<uid>." prefix.
    assert ova_params["classifier"] == lor.uid
    assert ova_params["labelCol"] == "abcd"
    assert ova_params[f"{lor.uid}.maxIter"] == 3
    assert ova_params[f"{lor.uid}.family"] == lor.getOrDefault(lor.family)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])
    pipeline_params = _get_instance_param_map(pipeline)
    nested_pipeline_params = _get_instance_param_map(nested_pipeline)
    # Flat pipeline stages are a list of uids; a nested pipeline is
    # represented as a {inner_uid: [stage uids]} dict inside the list.
    assert pipeline_params["stages"] == [tokenizer.uid, hashingTF.uid, ova.uid]
    assert nested_pipeline_params["stages"] == [
        tokenizer.uid,
        {inner_pipeline.uid: [hashingTF.uid, ova.uid]},
    ]
    # Stage params flatten identically whether the pipeline is flat or nested.
    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert params_to_test[f"{tokenizer.uid}.inputCol"] == "text"
        assert params_to_test[f"{tokenizer.uid}.outputCol"] == "words"
        assert params_to_test[f"{hashingTF.uid}.outputCol"] == "features"
        assert params_to_test[f"{ova.uid}.classifier"] == lor.uid
        assert params_to_test[f"{lor.uid}.maxIter"] == 3
def test_instance_param_map_captures_wrapped_params(dataset_binomial):
    """Verify ``_get_instance_param_map`` captures the params of an estimator
    wrapped inside a meta-estimator (OneVsRest), keyed by the wrapped
    estimator's uid, and that autologging records exactly that param map.

    NOTE(review): renamed from ``test_param_map_captures_wrapped_params`` —
    a later test in this file reused that exact name, so this definition was
    shadowed and never collected or run by pytest.
    """
    lor = LogisticRegression(maxIter=3, standardization=False)
    ova = OneVsRest(classifier=lor, labelCol="abcd")
    param_map = _get_instance_param_map(ova)
    assert param_map["labelCol"] == "abcd"
    assert param_map["classifier"] == lor.uid
    # Explicitly-set, overridden-default, and untouched-default params of the
    # wrapped classifier are all present under its uid prefix.
    assert param_map[f"{lor.uid}.maxIter"] == 3
    assert not param_map[f"{lor.uid}.standardization"]
    assert param_map[f"{lor.uid}.tol"] == lor.getOrDefault(lor.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        # Fit against a copy of the dataset with the custom label column name.
        ova.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))

    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    # Autologged params must match the instance param map, post the same
    # truncation/stringification applied at logging time.
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(_get_instance_param_map(ova))
    )
def test_param_map_captures_wrapped_params(dataset_binomial):
    """Verify ``get_params_to_log`` captures the params of an estimator wrapped
    inside a meta-estimator (OneVsRest), keyed by the wrapped estimator's class
    name, and that autologging records that param map plus a matching
    estimator-hierarchy artifact.

    One condition per ``assert`` so a pytest failure pinpoints the exact clause.
    """
    lor = LogisticRegression(maxIter=3, standardization=False)
    ova = OneVsRest(classifier=lor, labelCol="abcd")
    param_map = get_params_to_log(ova)
    assert param_map["labelCol"] == "abcd"
    assert param_map["classifier"] == "LogisticRegression"
    # Explicitly-set, overridden-default, and untouched-default params of the
    # wrapped classifier are all present under its class-name prefix.
    assert param_map["LogisticRegression.maxIter"] == 3
    assert not param_map["LogisticRegression.standardization"]
    assert param_map["LogisticRegression.tol"] == lor.getOrDefault(lor.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        # Fit against a copy of the dataset with the custom label column name.
        ova.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
        # The logged estimator_info artifact must mirror the metadata hierarchy.
        metadata = _gen_estimator_metadata(ova)
        estimator_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == estimator_info["hierarchy"]

    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    # Autologged params must match the flattened param map, post the same
    # truncation/stringification applied at logging time.
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(ova))
    )