Example #1
    def evaluate(self,
                 model,
                 model_type,
                 dataset,
                 run_id,
                 evaluator_config=None,
                 **kwargs) -> EvaluationResult:
        client = mlflow.tracking.MlflowClient()
        X, y = dataset._extract_features_and_labels()
        y_pred = model.predict(X)
        if model_type == "classifier":
            accuracy_score = sk_metrics.accuracy_score(y, y_pred)

            metrics = EvaluationMetrics(accuracy_score=accuracy_score)
            self._log_metrics(run_id, metrics, dataset.name)
            confusion_matrix = sk_metrics.confusion_matrix(y, y_pred)
            confusion_matrix_artifact_name = f"confusion_matrix_on_{dataset.name}.csv"
            confusion_matrix_artifact = Array2DEvaluationArtifact(
                uri=get_artifact_uri(run_id, confusion_matrix_artifact_name),
                content=confusion_matrix,
            )
            confusion_matrix_csv_buff = io.StringIO()
            confusion_matrix_artifact.save(confusion_matrix_csv_buff)
            client.log_text(run_id, confusion_matrix_csv_buff.getvalue(),
                            confusion_matrix_artifact_name)
            artifacts = {
                confusion_matrix_artifact_name: confusion_matrix_artifact
            }
        elif model_type == "regressor":
            mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred)
            mean_squared_error = sk_metrics.mean_squared_error(y, y_pred)
            metrics = EvaluationMetrics(
                mean_absolute_error=mean_absolute_error,
                mean_squared_error=mean_squared_error)
            self._log_metrics(run_id, metrics, dataset.name)
            artifacts = {}
        else:
            raise ValueError(f"Unsupported model type {model_type}")

        return EvaluationResult(metrics=metrics, artifacts=artifacts)
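
The _log_metrics helper called above is not part of the excerpt. Below is a minimal sketch of what such a helper might look like, assuming each metric is logged against the run under a "<metric>_on_<dataset name>" key via MlflowClient.log_batch; the naming convention is inferred from the call sites above, and everything else is an assumption rather than MLflow's confirmed implementation.

import time

import mlflow
from mlflow.entities import Metric


def _log_metrics(run_id, metrics, dataset_name):
    # Sketch only: log every metric under "<name>_on_<dataset name>" so that
    # metrics computed on different datasets do not collide within one run.
    client = mlflow.tracking.MlflowClient()
    timestamp = int(time.time() * 1000)
    client.log_batch(
        run_id,
        metrics=[
            Metric(
                key=f"{key}_on_{dataset_name}",
                value=value,
                timestamp=timestamp,
                step=0,
            ) for key, value in metrics.items()
        ],
    )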
Example #2
def test_evaluate_with_multi_evaluators(
        multiclass_logistic_regressor_model_uri, iris_dataset):
    with mock.patch.object(
            _model_evaluation_registry,
            "_registry",
        {
            "test_evaluator1": FakeEvauator1,
            "test_evaluator2": FakeEvauator2
        },
    ):
        evaluator1_config = {"eval1_confg": 3}
        evaluator2_config = {"eval2_confg": 4}
        evaluator1_return_value = EvaluationResult(
            metrics={"m1": 5}, artifacts={"a1": FakeArtifact1(uri="uri1")})
        evaluator2_return_value = EvaluationResult(
            metrics={"m2": 6}, artifacts={"a2": FakeArtifact2(uri="uri2")})

        # evaluators=None means the evaluators are unspecified; in that case all registered
        # evaluators should be used, and the evaluation results should equal those obtained
        # with evaluators=["test_evaluator1", "test_evaluator2"].
        for evaluators in [None, ["test_evaluator1", "test_evaluator2"]]:
            with mock.patch.object(
                    FakeEvauator1, "can_evaluate", return_value=True
            ) as mock_can_evaluate1, mock.patch.object(
                    FakeEvauator1,
                    "evaluate",
                    return_value=evaluator1_return_value
            ) as mock_evaluate1, mock.patch.object(
                    FakeEvauator2, "can_evaluate", return_value=True
            ) as mock_can_evaluate2, mock.patch.object(
                    FakeEvauator2,
                    "evaluate",
                    return_value=evaluator2_return_value) as mock_evaluate2:
                classifier_model = mlflow.pyfunc.load_model(
                    multiclass_logistic_regressor_model_uri)
                with mlflow.start_run() as run:
                    eval_result = evaluate(
                        classifier_model,
                        iris_dataset._constructor_args["data"],
                        model_type="classifier",
                        targets=iris_dataset._constructor_args["targets"],
                        dataset_name=iris_dataset.name,
                        evaluators=evaluators,
                        evaluator_config={
                            "test_evaluator1": evaluator1_config,
                            "test_evaluator2": evaluator2_config,
                        },
                    )
                    assert eval_result.metrics == {
                        **evaluator1_return_value.metrics,
                        **evaluator2_return_value.metrics,
                    }
                    assert eval_result.artifacts == {
                        **evaluator1_return_value.artifacts,
                        **evaluator2_return_value.artifacts,
                    }
                    mock_can_evaluate1.assert_called_once_with(
                        model_type="classifier",
                        evaluator_config=evaluator1_config)
                    mock_evaluate1.assert_called_once_with(
                        model=classifier_model,
                        model_type="classifier",
                        dataset=iris_dataset,
                        run_id=run.info.run_id,
                        evaluator_config=evaluator1_config,
                    )
                    mock_can_evaluate2.assert_called_once_with(
                        model_type="classifier",
                        evaluator_config=evaluator2_config,
                    )
                    mock_evaluate2.assert_called_once_with(
                        model=classifier_model,
                        model_type="classifier",
                        dataset=iris_dataset,
                        run_id=run.info.run_id,
                        evaluator_config=evaluator2_config,
                    )
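
The FakeEvauator and FakeArtifact test doubles used in these tests are defined elsewhere. A rough sketch of such stubs is shown below, assuming they subclass MLflow's ModelEvaluator and EvaluationArtifact base classes; the import path and the exact abstract methods vary between MLflow versions, so treat this as illustrative. FakeEvauator2 and FakeArtifact2 would follow the same pattern.

from mlflow.models.evaluation.base import EvaluationArtifact, ModelEvaluator


class FakeEvauator1(ModelEvaluator):
    # The real behavior is supplied by mock.patch.object in the tests, so the
    # stub methods only need to exist.
    def can_evaluate(self, *, model_type, evaluator_config, **kwargs):
        raise RuntimeError("This method should be patched in tests")

    def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs):
        raise RuntimeError("This method should be patched in tests")


class FakeArtifact1(EvaluationArtifact):
    def _save(self, output_artifact_path):
        raise RuntimeError("This method should be patched in tests")

    def _load_content_from_file(self, local_artifact_path):
        raise RuntimeError("This method should be patched in tests")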
Example #3
    def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config,
                 **kwargs) -> EvaluationResult:
        client = mlflow.tracking.MlflowClient()
        X = dataset.features_data
        y = dataset.labels_data
        y_pred = model.predict(X)
        if model_type == "classifier":
            accuracy_score = sk_metrics.accuracy_score(y, y_pred)

            metrics = {"accuracy_score": accuracy_score}
            self._log_metrics(run_id, metrics, dataset.name)
            confusion_matrix = sk_metrics.confusion_matrix(y, y_pred)
            confusion_matrix_artifact_name = f"confusion_matrix_on_{dataset.name}"
            confusion_matrix_artifact = Array2DEvaluationArtifact(
                uri=get_artifact_uri(run_id,
                                     confusion_matrix_artifact_name + ".csv"),
                content=confusion_matrix,
            )
            confusion_matrix_csv_buff = io.StringIO()
            confusion_matrix_artifact._save(confusion_matrix_csv_buff)
            client.log_text(
                run_id,
                confusion_matrix_csv_buff.getvalue(),
                confusion_matrix_artifact_name + ".csv",
            )

            confusion_matrix_figure = sk_metrics.ConfusionMatrixDisplay.from_predictions(
                y, y_pred).figure_
            img_buf = io.BytesIO()
            confusion_matrix_figure.savefig(img_buf)
            img_buf.seek(0)
            confusion_matrix_image = Image.open(img_buf)

            confusion_matrix_image_artifact_name = f"confusion_matrix_image_on_{dataset.name}"
            confusion_matrix_image_artifact = ImageEvaluationArtifact(
                uri=get_artifact_uri(
                    run_id, confusion_matrix_image_artifact_name + ".png"),
                content=confusion_matrix_image,
            )
            confusion_matrix_image_artifact._save(
                confusion_matrix_image_artifact_name + ".png")
            client.log_image(run_id, confusion_matrix_image,
                             confusion_matrix_image_artifact_name + ".png")

            artifacts = {
                confusion_matrix_artifact_name:
                confusion_matrix_artifact,
                confusion_matrix_image_artifact_name:
                confusion_matrix_image_artifact,
            }
        elif model_type == "regressor":
            mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred)
            mean_squared_error = sk_metrics.mean_squared_error(y, y_pred)
            metrics = {
                "mean_absolute_error": mean_absolute_error,
                "mean_squared_error": mean_squared_error,
            }
            self._log_metrics(run_id, metrics, dataset.name)
            artifacts = {}
        else:
            raise ValueError(f"Unsupported model type {model_type}")

        return EvaluationResult(metrics=metrics, artifacts=artifacts)
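
The Array2DEvaluationArtifact._save call above serializes the confusion matrix into a CSV buffer, but the artifact class itself is not shown. A plausible sketch follows, assuming the EvaluationArtifact base class keeps the payload on self._content and that CSV serialization goes through pandas; both are assumptions, not necessarily MLflow's actual implementation.

import pandas as pd

from mlflow.models.evaluation.base import EvaluationArtifact


class Array2DEvaluationArtifact(EvaluationArtifact):
    def _save(self, output_artifact_path):
        # Assumes the base class stored the 2-D array (here, the confusion
        # matrix) on self._content; a StringIO buffer or a file path both
        # work because pandas accepts either.
        pd.DataFrame(self._content).to_csv(output_artifact_path, index=False)

    def _load_content_from_file(self, local_artifact_path):
        self._content = pd.read_csv(local_artifact_path).to_numpy()
        return self._content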
Example #4
def test_evaluator_interface(multiclass_logistic_regressor_model_uri,
                             iris_dataset):
    with mock.patch.object(_model_evaluation_registry, "_registry",
                           {"test_evaluator1": FakeEvauator1}):
        evaluator1_config = {"eval1_confg_a": 3, "eval1_confg_b": 4}
        evaluator1_return_value = EvaluationResult(
            metrics={
                "m1": 5,
                "m2": 6
            },
            artifacts={
                "a1": FakeArtifact1(uri="uri1"),
                "a2": FakeArtifact2(uri="uri2")
            },
        )
        with mock.patch.object(
                FakeEvauator1, "can_evaluate",
                return_value=False) as mock_can_evaluate, mock.patch.object(
                    FakeEvauator1,
                    "evaluate",
                    return_value=evaluator1_return_value) as mock_evaluate:
            with mlflow.start_run():
                with pytest.raises(
                        ValueError,
                        match=
                        "The model could not be evaluated by any of the registered evaluators",
                ):
                    evaluate(
                        multiclass_logistic_regressor_model_uri,
                        data=iris_dataset._constructor_args["data"],
                        model_type="classifier",
                        targets=iris_dataset._constructor_args["targets"],
                        dataset_name=iris_dataset.name,
                        evaluators="test_evaluator1",
                        evaluator_config=evaluator1_config,
                    )
                mock_can_evaluate.assert_called_once_with(
                    model_type="classifier",
                    evaluator_config=evaluator1_config)
                mock_evaluate.assert_not_called()
        with mock.patch.object(
                FakeEvauator1, "can_evaluate",
                return_value=True) as mock_can_evaluate, mock.patch.object(
                    FakeEvauator1,
                    "evaluate",
                    return_value=evaluator1_return_value) as mock_evaluate:
            classifier_model = mlflow.pyfunc.load_model(
                multiclass_logistic_regressor_model_uri)
            with mlflow.start_run() as run:
                eval1_result = evaluate(
                    classifier_model,
                    iris_dataset._constructor_args["data"],
                    model_type="classifier",
                    targets=iris_dataset._constructor_args["targets"],
                    dataset_name=iris_dataset.name,
                    evaluators="test_evaluator1",
                    evaluator_config=evaluator1_config,
                )
                assert eval1_result.metrics == evaluator1_return_value.metrics
                assert eval1_result.artifacts == evaluator1_return_value.artifacts

                mock_can_evaluate.assert_called_once_with(
                    model_type="classifier",
                    evaluator_config=evaluator1_config)
                mock_evaluate.assert_called_once_with(
                    model=classifier_model,
                    model_type="classifier",
                    dataset=iris_dataset,
                    run_id=run.info.run_id,
                    evaluator_config=evaluator1_config,
                )
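
Outside of these mocked tests, the same evaluate() entry point is called directly. The usage sketch below mirrors the argument names used above (data, targets, model_type, dataset_name, evaluators, evaluator_config); note that the evaluate() signature has changed across MLflow releases (dataset_name, in particular, was later removed), so this matches the API version these examples were written against rather than current MLflow.

import mlflow
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1000).fit(X, y)

with mlflow.start_run():
    model_info = mlflow.sklearn.log_model(model, "model")
    result = mlflow.evaluate(
        model_info.model_uri,          # a model URI or a loaded pyfunc model
        data=X,
        targets=y,
        model_type="classifier",
        dataset_name="iris",           # accepted by the evaluate() version shown above
        evaluators="default",          # or a list such as ["default"]
    )
    print(result.metrics)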