def test_rare_feature_encoder_works_gridsearch(self,
                                               train_iris_dataset: Dataset,
                                               rare: RareFeatureEncoder):
    grid = create_gridsearch(rare)
    model = Model(grid)
    result = model.score_estimator(train_iris_dataset)
    assert isinstance(result, Result)
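The fixtures these snippets rely on (train_iris_dataset, classifier, dataset, rare, and so on) are presumably defined in the project's conftest and are not shown on this page. A minimal sketch of what the train_iris_dataset fixture could look like, assuming ml_tooling's load_demo_dataset helper that later examples call directly:

import pytest

from ml_tooling.data import load_demo_dataset


@pytest.fixture
def train_iris_dataset():
    # Hypothetical fixture: load the bundled iris demo data and split it up
    # front, so tests receive an already-split Dataset
    dataset = load_demo_dataset("iris")
    dataset.create_train_test(stratify=True)
    return dataset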
Example 2
    def test_from_yaml_serializes_correctly_with_feature_union(
            self, feature_union_classifier: DFFeatureUnion,
            tmp_path: pathlib.Path):

        model = Model(feature_union_classifier)
        result = model.to_dict()

        log = Log(name="test",
                  metrics=Metrics.from_list(["accuracy"]),
                  estimator=result)
        log.save_log(tmp_path)

        new_model = Model.from_yaml(log.output_path)

        assert len(new_model.estimator.steps[0][1].transformer_list) == 2
        new_steps = new_model.estimator.steps
        old_steps = model.estimator.steps

        assert new_steps[0][0] == old_steps[0][0]
        assert isinstance(new_steps[0][1], type(old_steps[0][1]))

        new_union = new_steps[0][1].transformer_list
        old_union = old_steps[0][1].transformer_list

        assert len(new_union) == len(old_union)

        for new_transform, old_transform in zip(new_union, old_union):
            assert new_transform[1].steps[0][0] == old_transform[1].steps[0][0]
            assert (new_transform[1].steps[0][1].get_params() ==
                    old_transform[1].steps[0][1].get_params())
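The feature_union_classifier fixture is likewise defined elsewhere. A hypothetical sketch that would satisfy this test's expectations (a Pipeline whose first step is a DFFeatureUnion holding two transformer pipelines, plus an "estimator" step as the gridsearch examples below assume), using ml_tooling's Select and DFStandardScaler transformers; step and column names are illustrative only:

import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from ml_tooling.transformers import DFFeatureUnion, DFStandardScaler, Select


@pytest.fixture
def feature_union_classifier():
    # Two parallel feature pipelines joined by a DFFeatureUnion
    union = DFFeatureUnion([
        ("sepal", Pipeline([("select", Select(["sepal length (cm)"])),
                            ("scale", DFStandardScaler())])),
        ("petal", Pipeline([("select", Select(["petal length (cm)"])),
                            ("scale", DFStandardScaler())])),
    ])
    # The "estimator" step name matches the estimator__penalty param grids
    # used in the search examples below
    return Pipeline([("union", union), ("estimator", LogisticRegression())])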
Example 3
    def test_can_score_estimator_with_no_y_value(self):
        class DummyEstimator(BaseEstimator, RegressorMixin):
            def __init__(self):
                self.average = None

            def fit(self, x, y=None):
                self.average = np.mean(x, axis=0)
                return self

            def predict(self, x):
                return self.average

        class DummyData(Dataset):
            def load_training_data(self):
                return pd.DataFrame({
                    "col1": [1, 2, 3, 4],
                    "col2": [4, 5, 6, 7]
                }), None

            def load_prediction_data(self, *args, **kwargs):
                return pd.DataFrame({
                    "col1": [1, 2, 3, 4],
                    "col2": [4, 5, 6, 7]
                })

        model = Model(DummyEstimator())
        data = DummyData()
        model.train_estimator(data)

        assert np.all(np.isclose(model.estimator.average, np.array([2.5,
                                                                    5.5])))

        with pytest.raises(DatasetError,
                           match="The dataset does not define a y value"):
            data.create_train_test()
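The two dummy classes above spell out the minimal contracts involved: an estimator needs fit and predict, and a Dataset subclass needs load_training_data returning an (x, y) pair (where y may be None) plus load_prediction_data returning the features to predict on. A sketch of the same Dataset contract backed by a CSV file; the path and target column name are placeholder assumptions:

import pandas as pd

from ml_tooling.data import Dataset

CSV_PATH = "training_data.csv"  # placeholder path, not from this page


class CsvData(Dataset):
    def load_training_data(self):
        # Return (features, target); returning None as the target triggers
        # the DatasetError exercised in the test above
        df = pd.read_csv(CSV_PATH)
        return df.drop(columns="target"), df["target"]

    def load_prediction_data(self, idx):
        # Return the feature row(s) to predict on
        df = pd.read_csv(CSV_PATH).drop(columns="target")
        return df.iloc[[idx]]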
Example 4
 def test_roc_curve_fails_correctly_without_predict_proba(self):
     dataset = load_demo_dataset("iris")
     svc = Model(SVC(gamma="scale"))
     result = svc.score_estimator(dataset)
     with pytest.raises(VizError):
         result.plot.roc_curve()
     plt.close()
Example 5
    def test_has_correct_title_when_using_trees(self, dataset: Dataset):
        """Expect the plot to not have Class in the title"""
        model = Model(RandomForestClassifier())
        result = model.score_estimator(dataset)

        ax = result.plot.feature_importance(class_index=10)
        assert "Class 10" not in ax.title.get_text()
Example 6
 def test_has_correct_xlabel_when_using_trees(self, dataset: Dataset):
     """Expect plotting feature_importance of a RandomForest to show Feature Importances as
     xlabels instead of coef"""
     model = Model(RandomForestClassifier())
     result = model.score_estimator(dataset)
     ax = result.plot.feature_importance()
     assert ax.get_xlabel() == "Feature Importances"
     plt.close()
Example 7
    def test_save_estimator_with_prod_flag_saves_correctly(
            self, classifier: Model):
        mock_storage = MagicMock()
        classifier.save_estimator(mock_storage, prod=True)

        mock_storage.save.assert_called_once_with(classifier.estimator,
                                                  "production_model.pkl",
                                                  prod=True)
Example 8
 def test_save_model_saves_pipeline_correctly(self,
                                              pipeline_logistic: Pipeline,
                                              tmp_path: pathlib.Path,
                                              train_iris_dataset):
     model = Model(pipeline_logistic)
     model.train_estimator(train_iris_dataset)
     saved_model_path = model.save_estimator(FileStorage(tmp_path))
     assert saved_model_path.exists()
Example 9
 def test_has_correct_xlabel_when_using_trees(self, dataset: Dataset):
     """Expect plotting feature_importance of a RandomForest to show Feature Importances as
     xlabels instead of coef"""
     model = Model(RandomForestClassifier())
     result = model.score_estimator(dataset)
     ax = result.plot.permutation_importance()
     assert (ax.get_xlabel() ==
             "Permuted Feature Importance (Accuracy) Relative to Baseline")
     plt.close()
Example 10
 def test_can_list_estimators(self, classifier: Model,
                              tmp_path: pathlib.Path):
     storage = FileStorage(tmp_path)
     for _ in range(3):
         classifier.save_estimator(storage)
     storage_context = FileStorage(tmp_path)
     filenames_list = Model.list_estimators(storage_context)
     for filename in filenames_list:
         assert filename.exists()
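Outside the tests, the save/load/list round trip looks roughly like this; a sketch assuming FileStorage is importable from ml_tooling.storage, using the load_demo_dataset helper from the examples on this page:

from sklearn.linear_model import LogisticRegression

from ml_tooling import Model
from ml_tooling.data import load_demo_dataset
from ml_tooling.storage import FileStorage

storage = FileStorage("estimators")  # placeholder directory for saved estimators

model = Model(LogisticRegression(solver="liblinear"))
model.train_estimator(load_demo_dataset("iris"))

saved_path = model.save_estimator(storage)
reloaded = Model.load_estimator(saved_path, storage=storage)

# Paths of every estimator previously saved to this storage
for path in Model.list_estimators(storage):
    print(path)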
Example 11
    def test_can_save_with_model(self, classifier: Model,
                                 tmp_path: pathlib.Path):
        storage = FileStorage(tmp_path)
        expected_file = classifier.save_estimator(storage)
        assert expected_file.exists()

        storage_context = FileStorage(tmp_path)
        context_expected_file = classifier.save_estimator(storage_context)
        assert context_expected_file.exists()
Example 12
    def test_can_score_estimator_with_multiple_metrics(self,
                                                       train_iris_dataset):
        model = Model(LogisticRegression(solver="liblinear"))
        result = model.score_estimator(train_iris_dataset,
                                       metrics=["accuracy", "roc_auc"])

        assert len(result.metrics) == 2
        assert "accuracy" in result.metrics
        assert "roc_auc" in result.metrics
Example 13
    def test_load_prediction_data_works_as_expected(self):
        dataset = load_demo_dataset("iris")
        dataset.create_train_test(stratify=True)
        feature_pipeline = Pipeline([("scale", DFStandardScaler())])
        model = Model(LogisticRegression(), feature_pipeline=feature_pipeline)
        model.train_estimator(dataset)
        result = model.make_prediction(dataset, 5)

        expected = pd.DataFrame({"Prediction": [0]})
        pd.testing.assert_frame_equal(result, expected, check_dtype=False)
Example 14
 def test_default_metric_getter_works_as_expected_regressor(self):
     linreg = Model(LinearRegression())
     assert linreg.config.CLASSIFIER_METRIC == "accuracy"
     assert linreg.config.REGRESSION_METRIC == "r2"
     assert linreg.default_metric == "r2"
     linreg.default_metric = "neg_mean_squared_error"
     assert linreg.config.CLASSIFIER_METRIC == "accuracy"
     assert linreg.config.REGRESSION_METRIC == "neg_mean_squared_error"
     assert linreg.default_metric == "neg_mean_squared_error"
     linreg.config.reset_config()
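In plain usage, the config-backed default metric behaves as this test shows; a brief sketch of the same API outside pytest:

from sklearn.linear_model import LinearRegression

from ml_tooling import Model

reg = Model(LinearRegression())
assert reg.default_metric == "r2"  # regressors read REGRESSION_METRIC

# The setter only touches the metric matching the estimator type
reg.default_metric = "neg_mean_squared_error"
assert reg.config.CLASSIFIER_METRIC == "accuracy"

reg.config.reset_config()  # config persists, so restore the defaults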
Example 15
 def test_gridsearch_can_log_with_context_manager(self,
                                                  feature_union_classifier,
                                                  train_iris_dataset,
                                                  tmp_path):
     classifier = Model(feature_union_classifier)
     classifier.config.RUN_DIR = tmp_path
     with classifier.log("gridsearch_union_test"):
         _, _ = classifier.gridsearch(
             train_iris_dataset,
             param_grid={"estimator__penalty": ["l1", "l2"]})
Example 16
 def test_default_metric_getter_works_as_expected_classifier(self):
     rf = Model(RandomForestClassifier(n_estimators=10))
     assert rf.config.CLASSIFIER_METRIC == "accuracy"
     assert rf.config.REGRESSION_METRIC == "r2"
     assert rf.default_metric == "accuracy"
     rf.default_metric = "fowlkes_mallows_score"
     assert rf.config.CLASSIFIER_METRIC == "fowlkes_mallows_score"
     assert rf.config.REGRESSION_METRIC == "r2"
     assert rf.default_metric == "fowlkes_mallows_score"
     rf.config.reset_config()
Example 17
    def test_save_estimator_uses_default_storage_if_no_storage_is_passed(
            self, tmp_path: pathlib.Path, classifier: Model):
        classifier.config.ESTIMATOR_DIR = tmp_path
        classifier.save_estimator()

        models = classifier.config.default_storage.get_list()
        assert len(models) == 1
        new_classifier = Model.load_estimator(models[0])
        assert (classifier.estimator.get_params() ==
                new_classifier.estimator.get_params())
Example 18
 def test_randomsearch_can_log_with_context_manager(
         self, feature_union_classifier, train_iris_dataset, tmp_path):
     classifier = Model(feature_union_classifier)
     classifier.config.RUN_DIR = tmp_path
     with classifier.log("randomsearch_union_test"):
         _, _ = classifier.randomsearch(
             train_iris_dataset,
             param_distributions={"estimator__penalty": ["l1", "l2"]},
             n_iter=2,
         )
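Both search variants accept the same log() context manager outside pytest too; a sketch with a placeholder RUN_DIR, assuming a bare LogisticRegression so the parameter name needs no estimator__ prefix:

from sklearn.linear_model import LogisticRegression

from ml_tooling import Model
from ml_tooling.data import load_demo_dataset

dataset = load_demo_dataset("iris")
model = Model(LogisticRegression(solver="liblinear"))
model.config.RUN_DIR = "runs"  # placeholder log directory

# Each search result is logged as a YAML file under RUN_DIR
with model.log("penalty_search"):
    best_model, results = model.randomsearch(
        dataset,
        param_distributions={"penalty": ["l1", "l2"]},
        n_iter=2,
    )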
Example 19
    def test_raises_if_passed_model_without_feature_importance_or_coefs(
            self, dataset: Dataset):
        """
        Expect an exception if trying to plot an estimator that doesn't have
        coefficients or feature_importance
        """
        model = Model(KNeighborsClassifier())
        result = model.score_estimator(dataset)

        with pytest.raises(VizError):
            result.plot.feature_importance()
Example 20
 def test_can_load_with_model(self, classifier: Model,
                              tmp_path: pathlib.Path):
     storage = FileStorage(tmp_path)
     expected_file = classifier.save_estimator(storage)
     assert expected_file.exists()
     loaded_file = classifier.load_estimator(expected_file, storage=storage)
     assert isinstance(loaded_file, Model)
     storage_context = FileStorage(tmp_path)
     context_loaded_file = classifier.load_estimator(
         expected_file, storage=storage_context)
     assert isinstance(context_loaded_file, Model)
Example 21
 def test_pr_curve_fails_correctly_without_predict_proba(self):
     """
     Expect that the plot will raise an exception if the estimator
     does not have a predict_proba method
     """
     dataset = load_demo_dataset("iris")
     svc = Model(SVC(gamma="scale"))
     result = svc.score_estimator(dataset)
     with pytest.raises(VizError):
         result.plot.precision_recall_curve()
     plt.close()
Example 22
 def test_default_metric_works_as_expected_without_pipeline(self):
     rf = Model(RandomForestClassifier(n_estimators=10))
     linreg = Model(LinearRegression())
     assert "accuracy" == rf.default_metric
     assert "r2" == linreg.default_metric
     rf.config.CLASSIFIER_METRIC = "fowlkes_mallows_score"
     linreg.config.REGRESSION_METRIC = "neg_mean_squared_error"
     assert "fowlkes_mallows_score" == rf.default_metric
     assert "neg_mean_squared_error" == linreg.default_metric
     rf.config.reset_config()
     linreg.config.reset_config()
Example 23
 def test_can_load_serialized_model_from_estimator(self, classifier: Model,
                                                   tmp_path: pathlib.Path):
     log = Log(
         name="test",
         estimator=classifier.to_dict(),
         metrics=Metrics([Metric("accuracy", score=1.0)]),
     )
     log.save_log(tmp_path)
     model2 = Model.from_yaml(log.output_path)
     assert (model2.estimator.get_params() ==
             classifier.estimator.get_params())
Example 24
    def test_make_prediction_with_regression_sqldataset_works_as_expected(
            self, boston_sqldataset, loaded_boston_db):
        dataset = boston_sqldataset(loaded_boston_db, schema=None)
        dataset.create_train_test(stratify=False)
        model = Model(LinearRegression())
        model.train_estimator(dataset)

        result = model.make_prediction(dataset, 0)

        assert result.shape == (1, 1)
        assert result.columns.tolist() == ["Prediction"]
Example 25
 def test_default_metric_works_as_expected_with_pipeline(
         self, pipeline_logistic: Pipeline, pipeline_linear: Pipeline):
     logreg = Model(pipeline_logistic)
     linreg = Model(pipeline_linear)
     assert "accuracy" == logreg.default_metric
     assert "r2" == linreg.default_metric
     logreg.config.CLASSIFIER_METRIC = "fowlkes_mallows_score"
     linreg.config.REGRESSION_METRIC = "neg_mean_squared_error"
     assert "fowlkes_mallows_score" == logreg.default_metric
     assert "neg_mean_squared_error" == linreg.default_metric
     logreg.config.reset_config()
     linreg.config.reset_config()
Example 26
    def test_bayesiansearch_best_model_is_not_fitted_if_refit_is_not_true(
            self, pipeline_logistic: Pipeline, train_iris_dataset: Dataset):

        model = Model(pipeline_logistic)
        model, results = model.bayesiansearch(
            train_iris_dataset,
            param_distributions={"estimator__penalty": ["l1", "l2"]},
            refit=False,
        )
        with pytest.raises(MLToolingError,
                           match="You haven't fitted the estimator"):
            model.make_prediction(data=train_iris_dataset, idx=1)
Example 27
    def test_save_estimator_saves_logging_dir_correctly(
            self, mock_hash: MagicMock, classifier: Model,
            tmp_path: pathlib.Path):
        mock_hash.return_value = "1234"

        with classifier.log(str(tmp_path)):
            expected_file = classifier.save_estimator(FileStorage(tmp_path))

        assert expected_file.exists()
        assert ("LogisticRegression"
                in [str(file) for file in tmp_path.rglob("*.yaml")][0])
        mock_hash.assert_called_once()
Example 28
    def test_score_estimator_creates_train_test_data(self, boston_dataset,
                                                     train_boston_dataset):
        model = Model(LinearRegression())
        data = boston_dataset()
        model.score_estimator(data)

        test = train_boston_dataset

        pd.testing.assert_frame_equal(data.test_x, test.test_x)
        assert np.array_equal(data.test_y, test.test_y)
        pd.testing.assert_frame_equal(data.train_x, test.train_x)
        assert np.array_equal(data.train_y, test.train_y)
Example 29
    def test_score_estimator_creates_train_test_data_classification(
            self, iris_dataset, train_iris_dataset):
        model = Model(LogisticRegression())
        data = iris_dataset()
        model.score_estimator(data)

        test = train_iris_dataset

        pd.testing.assert_frame_equal(data.test_x, test.test_x)
        assert np.array_equal(data.test_y, test.test_y)
        pd.testing.assert_frame_equal(data.train_x, test.train_x)
        assert np.array_equal(data.train_y, test.train_y)
Example 30
    def test_gridsearch_model_returns_as_expected(self,
                                                  pipeline_logistic: Pipeline,
                                                  train_iris_dataset):
        model = Model(pipeline_logistic)
        model, results = model.gridsearch(
            train_iris_dataset,
            param_grid={"estimator__penalty": ["l1", "l2"]})
        assert isinstance(model.estimator, Pipeline)
        assert 2 == len(results)

        for result in results:
            assert isinstance(result, Result)
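The results object iterated over here holds one Result per parameter combination, each with the metrics and plot accessors seen throughout these examples. A short self-contained usage sketch under that assumption:

from sklearn.linear_model import LogisticRegression

from ml_tooling import Model
from ml_tooling.data import load_demo_dataset

dataset = load_demo_dataset("iris")
model = Model(LogisticRegression(solver="liblinear"))

best_model, results = model.gridsearch(
    dataset, param_grid={"penalty": ["l1", "l2"]})

for result in results:
    # Each Result carries its scored metrics; result.plot gives the plots
    print(result.metrics)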