def test_train_model_followed_by_score_model_returns_correctly(
        self, pipeline_logistic: Pipeline, train_iris_dataset):
    """Training and then scoring on the same dataset should leave a Result on the model."""
    fitted = Model(pipeline_logistic)

    fitted.train_estimator(train_iris_dataset)
    fitted.score_estimator(train_iris_dataset)

    assert isinstance(fitted.result, Result)
def test_score_estimator_creates_train_test_data_classification(
        self, iris_dataset, train_iris_dataset):
    """After scoring, the dataset's train/test data should equal that of the
    train_iris_dataset fixture."""
    scored = iris_dataset()
    Model(LogisticRegression()).score_estimator(scored)

    expected = train_iris_dataset

    # Held-out portion
    pd.testing.assert_frame_equal(scored.test_x, expected.test_x)
    assert np.array_equal(scored.test_y, expected.test_y)

    # Training portion
    pd.testing.assert_frame_equal(scored.train_x, expected.train_x)
    assert np.array_equal(scored.train_y, expected.train_y)
def test_score_estimator_creates_train_test_data(self, boston_dataset, train_boston_dataset):
    """After scoring, the dataset's train/test data should equal that of the
    train_boston_dataset fixture."""
    scored = boston_dataset()
    Model(LinearRegression()).score_estimator(scored)

    expected = train_boston_dataset

    # Held-out portion
    pd.testing.assert_frame_equal(scored.test_x, expected.test_x)
    assert np.array_equal(scored.test_y, expected.test_y)

    # Training portion
    pd.testing.assert_frame_equal(scored.train_x, expected.train_x)
    assert np.array_equal(scored.train_y, expected.train_y)
def test_regression_model_can_be_saved(self, classifier: Model, tmp_path: pathlib.Path, train_iris_dataset):
    """A scored model saved through FileStorage should exist on disk and load
    back with the same estimator parameters."""
    classifier.score_estimator(train_iris_dataset)

    # Separate storage objects for loading and saving, both rooted at tmp_path
    load_storage = FileStorage(tmp_path)
    save_storage = FileStorage(tmp_path)

    model_path = classifier.save_estimator(save_storage)
    assert model_path.exists()

    restored = classifier.load_estimator(model_path, storage=load_storage)
    restored_params = restored.estimator.get_params()
    assert restored_params == classifier.estimator.get_params()
def test_rare_feature_encoder_works_gridsearch(self, train_iris_dataset: Dataset, rare: RareFeatureEncoder):
    """Scoring a gridsearch built around the rare-feature encoder should return a Result."""
    model = Model(create_gridsearch(rare))

    scored = model.score_estimator(train_iris_dataset)

    assert isinstance(scored, Result)
def test_has_correct_title_when_using_trees(self, dataset: Dataset):
    """Expect the plot to not have Class in the title.

    feature_importance is called with class_index=10 on a
    RandomForestClassifier, and the resulting title must not contain
    "Class 10".
    """
    model = Model(RandomForestClassifier())
    result = model.score_estimator(dataset)
    ax = result.plot.feature_importance(class_index=10)
    assert "Class 10" not in ax.title.get_text()
    # Close the figure, as the sibling plot tests do, so it does not stay
    # open for the remainder of the test run.
    plt.close()
def test_score_estimator_creates_train_test_data_with_changed_config_and_classification_data(
        self, iris_dataset):
    """Scoring should split the data using the overridden RANDOM_STATE and TEST_SIZE.

    The split produced by score_estimator is compared against a split made
    directly with create_train_test(stratify=True, seed=1, test_size=0.50).
    """
    model = Model(LogisticRegression())
    try:
        model.config.RANDOM_STATE = 1
        model.config.TEST_SIZE = 0.50

        data = iris_dataset()
        model.score_estimator(data)

        test = iris_dataset()
        test.create_train_test(stratify=True, seed=1, test_size=0.50)

        pd.testing.assert_frame_equal(data.test_x, test.test_x)
        assert np.array_equal(data.test_y, test.test_y)
        pd.testing.assert_frame_equal(data.train_x, test.train_x)
        assert np.array_equal(data.train_y, test.train_y)
    finally:
        # Always restore the config, even when an assertion above fails,
        # so the overridden values are not left in place after this test.
        model.config.reset_config()
def test_roc_curve_fails_correctly_without_predict_proba(self):
    """Expect roc_curve to raise VizError for a scored SVC(gamma="scale") model."""
    iris = load_demo_dataset("iris")
    scored = Model(SVC(gamma="scale")).score_estimator(iris)

    with pytest.raises(VizError):
        scored.plot.roc_curve()

    plt.close()
def test_has_correct_xlabel_when_using_trees(self, dataset: Dataset):
    """The feature_importance plot of a RandomForest should label the x-axis
    "Feature Importances"."""
    scored = Model(RandomForestClassifier()).score_estimator(dataset)

    axis = scored.plot.feature_importance()
    xlabel = axis.get_xlabel()

    assert xlabel == "Feature Importances"
    plt.close()
def test_can_score_estimator_with_multiple_metrics(self, train_iris_dataset):
    """Passing a list of two metrics should yield exactly those two metrics on the result."""
    requested = ["accuracy", "roc_auc"]
    model = Model(LogisticRegression(solver="liblinear"))

    scored = model.score_estimator(train_iris_dataset, metrics=requested)

    assert len(scored.metrics) == 2
    assert "accuracy" in scored.metrics
    assert "roc_auc" in scored.metrics
def test_has_correct_xlabel_when_using_trees(self, dataset: Dataset):
    """The permutation_importance plot of a RandomForest should label the
    x-axis with the permuted-importance text, including the metric name."""
    scored = Model(RandomForestClassifier()).score_estimator(dataset)

    axis = scored.plot.permutation_importance()
    xlabel = axis.get_xlabel()

    assert xlabel == "Permuted Feature Importance (Accuracy) Relative to Baseline"
    plt.close()
def test_score_estimator_creates_train_test_data_with_changed_config(
        self, boston_dataset):
    """Scoring should split the data using the overridden config values.

    RANDOM_STATE, TEST_SIZE and TRAIN_TEST_SHUFFLE are changed, and the
    split produced by score_estimator is compared against a split made
    directly with create_train_test(stratify=False, shuffle=False, seed=1,
    test_size=0.5).
    """
    model = Model(LinearRegression())
    try:
        model.config.RANDOM_STATE = 1
        model.config.TEST_SIZE = 0.5
        model.config.TRAIN_TEST_SHUFFLE = False

        data = boston_dataset()
        model.score_estimator(data)

        test = boston_dataset()
        test.create_train_test(stratify=False, shuffle=False, seed=1, test_size=0.5)

        pd.testing.assert_frame_equal(data.test_x, test.test_x)
        assert np.array_equal(data.test_y, test.test_y)
        pd.testing.assert_frame_equal(data.train_x, test.train_x)
        assert np.array_equal(data.train_y, test.train_y)
    finally:
        # Always restore the config, even when an assertion above fails,
        # so the overridden values are not left in place after this test.
        model.config.reset_config()
def test_raises_if_passed_model_without_feature_importance_or_coefs(
        self, dataset: Dataset):
    """
    Plotting feature importance for an estimator that has neither
    coefficients nor feature_importance should raise VizError
    """
    scored = Model(KNeighborsClassifier()).score_estimator(dataset)

    with pytest.raises(VizError):
        scored.plot.feature_importance()
def test_pr_curve_fails_correctly_without_predict_proba(self):
    """
    The precision-recall plot should raise VizError when the estimator
    has no predict_proba method
    """
    iris = load_demo_dataset("iris")
    scored = Model(SVC(gamma="scale")).score_estimator(iris)

    with pytest.raises(VizError):
        scored.plot.precision_recall_curve()

    plt.close()
def test_log_context_manager_logs_when_scoring_model(
        self, tmp_path: pathlib.Path, train_iris_dataset):
    """Scoring inside model.log() should write a YAML log matching the result.

    Every file under the run directory whose name starts with
    "LinearRegression_" is loaded and its r2 metric and estimator name are
    compared with the returned Result.
    """
    model = Model(LinearRegression())
    runs = tmp_path / "runs"

    with model.log(str(runs)):
        result = model.score_estimator(train_iris_dataset)

    log_files = list(runs.rglob("LinearRegression_*"))
    # Without this check the loop below would be skipped when nothing was
    # logged, and the test would pass without verifying anything.
    assert log_files, "expected at least one LinearRegression_* log file"

    for file in log_files:
        with file.open() as f:
            log_result = yaml.safe_load(f)

        assert result.metrics.score == log_result["metrics"]["r2"]
        assert result.model.estimator_name == log_result["estimator_name"]
def test_plots_have_correct_title_when_using_pipeline(
        self, dataset: Dataset):
    """
    Plots of a pipeline-wrapped estimator should be titled with the
    final estimator's name rather than Pipeline, and show four y tick labels
    """
    steps = [
        ("scale", DFStandardScaler()),
        ("clf", RandomForestClassifier(n_estimators=10)),
    ]
    scored = Model(Pipeline(steps)).score_estimator(dataset)

    axis = scored.plot.permutation_importance()

    title = axis.title.get_text()
    assert title == "Permutation Importances (Accuracy) - RandomForestClassifier"

    tick_labels = list(axis.get_yticklabels())
    assert 4 == len(tick_labels)
    plt.close()
def train_model(year, month, day, graphs=True, clf=None):
    """Score and save a model on the AirBnB dataset for the given date.

    Args:
        year: Passed to AirBnBDataset as ``year``.
        month: Passed to AirBnBDataset as ``month``.
        day: Passed to AirBnBDataset as ``day``.
        graphs: When truthy, save feature importance, residuals and
            prediction error plots under VISUALIZATIONS.
        clf: Estimator to wrap in the Model. When None (the default), a
            fresh RandomForestRegressor is created for this call.

    Returns:
        The result returned by ``model.score_estimator``.
    """
    # Build the default estimator per call: a default evaluated in the
    # signature would be a single instance shared by every call.
    if clf is None:
        clf = RandomForestRegressor()

    dataset = AirBnBDataset(year=year, month=month, day=day)
    dataset.create_train_test()

    model = Model(clf, feature_pipeline=features)
    result = model.score_estimator(dataset)

    # NOTE(review): N_JOBS is set after scoring, so it cannot affect the
    # score_estimator call above - confirm this ordering is intended.
    model.config.N_JOBS = 6

    with model.log("randomforest"):
        model.save_estimator()

    if graphs:
        result.plot.feature_importance()
        # NOTE(review): the feature importance plot is written to
        # "confusion_matrix.png"; filename kept as-is in case other code
        # reads it - confirm whether it should be renamed.
        plt.savefig(VISUALIZATIONS / "confusion_matrix.png")

        result.plot.residuals()
        plt.savefig(VISUALIZATIONS / "residuals.png")

        result.plot.prediction_error()
        plt.savefig(VISUALIZATIONS / "prediction_error.png")

    return result
def test_can_score_estimator_with_default_metric(self, train_iris_dataset):
    """Scoring a LogisticRegression without a metrics argument should report "accuracy"."""
    estimator = LogisticRegression(solver="liblinear")

    scored = Model(estimator).score_estimator(train_iris_dataset)

    assert scored.metrics.name == "accuracy"
def classifier_result(self) -> Result:
    """Set up a classifier Result by scoring LogisticRegression on the iris demo dataset"""
    iris = load_demo_dataset("iris")
    return Model(LogisticRegression()).score_estimator(iris)
def result_cv(self, model: Model) -> Result:
    """Set up a Result by scoring the given model on the boston demo dataset with cv=2"""
    boston = load_demo_dataset("boston")
    scored = model.score_estimator(boston, cv=2)
    return scored
def result(self, model: Model) -> Result:
    """Set up a Result by scoring the given model on the boston demo dataset without cv"""
    boston = load_demo_dataset("boston")
    scored = model.score_estimator(boston)
    return scored
def test_df_selector_works_gridsearch(self, train_iris_dataset):
    """Scoring a gridsearch built around Select on one column should return a Result."""
    selector = Select("sepal length (cm)")
    model = Model(create_gridsearch(selector))

    scored = model.score_estimator(train_iris_dataset)

    assert isinstance(scored, Result)
def test_binarize_works_in_gridsearch(self, train_iris_dataset):
    """Scoring a gridsearch built around Binarize(value=2) should return a Result."""
    binarizer = Binarize(value=2)
    model = Model(create_gridsearch(binarizer))

    scored = model.score_estimator(train_iris_dataset)

    assert isinstance(scored, Result)
def test_func_transformer_works_in_gridsearch(self, train_iris_dataset):
    """Scoring a gridsearch built around FuncTransformer(np.mean) should return a Result."""
    transformer = FuncTransformer(np.mean)
    model = Model(create_gridsearch(transformer))

    scored = model.score_estimator(train_iris_dataset)

    assert isinstance(scored, Result)
def test_dfrowfunc_works_in_gridsearch(self, train_iris_dataset):
    """Scoring a gridsearch built around DFRowFunc(strategy="mean") should return a Result."""
    row_func = DFRowFunc(strategy="mean")
    model = Model(create_gridsearch(row_func))

    scored = model.score_estimator(train_iris_dataset)

    assert isinstance(scored, Result)
def test_standard_scaler_works_in_gridsearch(self, train_iris_dataset):
    """Scoring a gridsearch built around DFStandardScaler should return a Result."""
    scaler = DFStandardScaler()
    model = Model(create_gridsearch(scaler))

    scored = model.score_estimator(train_iris_dataset)

    assert isinstance(scored, Result)
def test_renamer_works_gridsearch(self, train_iris_dataset):
    """Scoring a gridsearch built around a four-name Renamer should return a Result."""
    renamer = Renamer(["1", "2", "3", "4"])
    model = Model(create_gridsearch(renamer))

    scored = model.score_estimator(train_iris_dataset)

    assert isinstance(scored, Result)
def test_to_categorical_works_gridsearch(self, train_iris_dataset):
    """Scoring a gridsearch built around ToCategorical should return a Result."""
    to_categorical = ToCategorical()
    model = Model(create_gridsearch(to_categorical))

    scored = model.score_estimator(train_iris_dataset)

    assert isinstance(scored, Result)
def regression_result(self) -> Result:
    """Set up a regression Result by scoring LinearRegression on the boston demo dataset"""
    boston = load_demo_dataset("boston")
    return Model(LinearRegression()).score_estimator(boston)
def test_can_score_estimator_with_specified_metric(self, train_iris_dataset):
    """Scoring with metrics="roc_auc" should report "roc_auc" as the metric name."""
    estimator = LogisticRegression(solver="liblinear")

    scored = Model(estimator).score_estimator(train_iris_dataset, metrics="roc_auc")

    assert scored.metrics.name == "roc_auc"