def test_standard_scaler_with_std_false_returns_correct_dataframe(
        self, numerical: pd.DataFrame):
    """With with_std=False the scaler should only center the columns."""
    expected = numerical.copy()
    # Centering subtracts the per-column means of the fixture (2.5 and 6.5).
    expected["number_a"] = numerical["number_a"] - 2.5
    expected["number_b"] = numerical["number_b"] - 6.5

    transformed = DFStandardScaler(with_std=False).fit_transform(numerical)

    pd.testing.assert_frame_equal(transformed, expected)
def test_DFStandardScaler_returns_correct_dataframe(numerical):
    """Default DFStandardScaler should give zero-mean, unit-variance columns."""
    std = 1.118033988749895  # population std of both fixture columns
    expected = numerical.copy()
    expected['number_a'] = (numerical['number_a'] - 2.5) / std
    expected['number_b'] = (numerical['number_b'] - 6.5) / std

    transformed = DFStandardScaler().fit_transform(numerical)

    pd.testing.assert_frame_equal(transformed, expected)
def test_standard_scaler_returns_correct_dataframe(
        self, numerical: pd.DataFrame):
    """Default DFStandardScaler should give zero-mean, unit-variance columns."""
    std = 1.118033988749895  # population std of both fixture columns
    expected = numerical.copy()
    expected["number_a"] = (numerical["number_a"] - 2.5) / std
    expected["number_b"] = (numerical["number_b"] - 6.5) / std

    transformed = DFStandardScaler().fit_transform(numerical)

    pd.testing.assert_frame_equal(transformed, expected)
def feature_union_classifier() -> Pipeline:
    """Classifier pipeline whose features are a union of two scaled column selections."""
    sepal_branch = Pipeline([
        ("select", Select(["sepal length (cm)", "sepal width (cm)"])),
        ("scale", DFStandardScaler()),
    ])
    petal_branch = Pipeline([
        ("select", Select(["petal length (cm)", "petal width (cm)"])),
        ("scale", DFStandardScaler()),
    ])
    features = DFFeatureUnion(
        transformer_list=[("pipe1", sepal_branch), ("pipe2", petal_branch)])
    return Pipeline([
        ("features", features),
        ("estimator", LogisticRegression(solver="liblinear")),
    ])
def test_load_prediction_data_works_as_expected(self):
    """A trained model should predict class 0 for row 5 of the iris demo data."""
    dataset = load_demo_dataset("iris")
    dataset.create_train_test(stratify=True)

    model = Model(
        LogisticRegression(),
        feature_pipeline=Pipeline([("scale", DFStandardScaler())]),
    )
    model.train_estimator(dataset)
    prediction = model.make_prediction(dataset, 5)

    pd.testing.assert_frame_equal(
        prediction, pd.DataFrame({"Prediction": [0]}), check_dtype=False)
def test_standard_scaler_works_in_pipeline_with_feature_union(
        self, numerical: pd.DataFrame):
    """Scaling after a DFFeatureUnion should still standardize every column."""
    std = 1.118033988749895  # population std of both fixture columns
    expected = numerical.copy()
    expected["number_a"] = (numerical["number_a"] - 2.5) / std
    expected["number_b"] = (numerical["number_b"] - 6.5) / std

    selectors = DFFeatureUnion([("number_a", Select(["number_a"])),
                                ("number_b", Select(["number_b"]))])
    transformed = make_pipeline(selectors,
                                DFStandardScaler()).fit_transform(numerical)

    pd.testing.assert_frame_equal(transformed, expected)
def test_model_selection_works_with_feature_pipeline(
        self, train_iris_dataset: Dataset):
    """Model.test_estimators should wrap the winning estimator in the feature pipeline."""
    candidates = [
        RandomForestClassifier(),
        DummyClassifier(strategy="stratified"),
    ]
    scaling = Pipeline([("scale", DFStandardScaler())])

    best_estimator, results = Model.test_estimators(
        data=train_iris_dataset,
        estimators=candidates,
        feature_pipeline=scaling,
    )

    expected = Pipeline([("features", scaling),
                         ("estimator", candidates[0])])
    assert best_estimator.estimator.get_params() == expected.get_params()
def test_DFStandardScaler_works_in_pipeline_with_DFFeatureUnion(
        categorical, numerical):
    """Scaling after a DFFeatureUnion should still standardize every column."""
    # NOTE(review): the `categorical` fixture is requested but unused here;
    # kept to preserve the test signature.
    std = 1.118033988749895  # population std of both fixture columns
    expected = numerical.copy()
    expected['number_a'] = (numerical['number_a'] - 2.5) / std
    expected['number_b'] = (numerical['number_b'] - 6.5) / std

    selectors = DFFeatureUnion([('number_a', Select(['number_a'])),
                                ('number_b', Select(['number_b']))])
    transformed = make_pipeline(selectors,
                                DFStandardScaler()).fit_transform(numerical)

    pd.testing.assert_frame_equal(transformed, expected)
def test_plots_have_correct_title_when_using_pipeline(
        self, dataset: Dataset):
    """
    Plots should work with pipelines and title themselves after the final
    estimator rather than "Pipeline".
    """
    pipeline = Pipeline([
        ("scale", DFStandardScaler()),
        ("clf", RandomForestClassifier(n_estimators=10)),
    ])
    result = Model(pipeline).score_estimator(dataset)
    ax = result.plot.permutation_importance()

    expected_title = (
        "Permutation Importances (Accuracy) - RandomForestClassifier")
    assert ax.title.get_text() == expected_title
    assert len(list(ax.get_yticklabels())) == 4
    plt.close()
def test_standard_scaler_works_in_gridsearch(self, train_iris_dataset):
    """A gridsearch containing DFStandardScaler should score without errors."""
    model = Model(create_gridsearch(DFStandardScaler()))
    scored = model.score_estimator(train_iris_dataset)
    assert isinstance(scored, Result)
def test_standard_scaler_works_in_cv(self, train_iris_dataset):
    """Cross-validated scoring with DFStandardScaler should produce a Result."""
    scored = create_model(DFStandardScaler()).score_estimator(
        train_iris_dataset, cv=2)
    assert isinstance(scored, Result)
def test_can_reset_scaler_parameters(self):
    """_reset should remove the fitted attributes from the scaler."""
    scaler = DFStandardScaler()
    # Simulate a fitted state by hand.
    scaler.scale_ = 0.7
    scaler.mean_ = 0.5

    scaler._reset()

    assert not hasattr(scaler, "scale_")
def test_works_without_args(self):
    """DFStandardScaler should be constructible with no arguments."""
    scaler = DFStandardScaler()
    assert scaler
def test_missing_data_can_pass_pipeline(self, missing_data: Dataset):
    """The missing-data plot should accept a feature pipeline and annotate 2.0%."""
    scaling = Pipeline([("scaler", DFStandardScaler())])
    ax = missing_data.plot.missing_data(feature_pipeline=scaling)

    annotations = [text.get_text() for text in ax.texts]
    assert annotations == ["2.0%"]
    plt.close()
def pipeline_forest_classifier() -> Pipeline:
    """Scaled random-forest pipeline (10 trees)."""
    return Pipeline([
        ("scale", DFStandardScaler()),
        ("estimator", RandomForestClassifier(n_estimators=10)),
    ])
def pipeline_dummy_classifier() -> Pipeline:
    """Scaled dummy-classifier pipeline using the 'prior' strategy."""
    return Pipeline([
        ("scale", DFStandardScaler()),
        ("estimator", DummyClassifier(strategy="prior")),
    ])