def test_standard_scaler_with_std_false_returns_correct_dataframe(
            self, numerical: pd.DataFrame):
        """With with_std=False the scaler only centers columns (no scaling)."""
        # Fixture means are presumably 2.5 / 6.5 — centering is the whole effect here.
        expected = numerical.assign(
            number_a=numerical["number_a"] - 2.5,
            number_b=numerical["number_b"] - 6.5,
        )

        result = DFStandardScaler(with_std=False).fit_transform(numerical)

        pd.testing.assert_frame_equal(result, expected)
Example #2
0
def test_DFStandardScaler_returns_correct_dataframe(numerical):
    """Default DFStandardScaler z-scores every column of the frame."""
    # sqrt(1.25): presumably the population std of the fixture columns — confirm in fixture
    std = 1.118033988749895
    expected = numerical.copy()
    expected['number_a'] = (numerical['number_a'] - 2.5) / std
    expected['number_b'] = (numerical['number_b'] - 6.5) / std

    result = DFStandardScaler().fit_transform(numerical)

    pd.testing.assert_frame_equal(result, expected)
    def test_standard_scaler_returns_correct_dataframe(
            self, numerical: pd.DataFrame):
        """Default DFStandardScaler centers and scales each column."""
        # sqrt(1.25): presumably the population std of both fixture columns
        scale = 1.118033988749895
        expected = numerical.copy()
        for column, mean in (("number_a", 2.5), ("number_b", 6.5)):
            expected[column] = (numerical[column] - mean) / scale

        result = DFStandardScaler().fit_transform(numerical)

        pd.testing.assert_frame_equal(result, expected)
Example #4
0
def feature_union_classifier() -> Pipeline:
    """Build a logistic-regression classifier fed by a two-branch DFFeatureUnion.

    Each branch selects a pair of iris columns and standardizes them.
    """
    def select_and_scale(columns):
        # One union branch: column selection followed by standardization.
        return Pipeline([
            ("select", Select(columns)),
            ("scale", DFStandardScaler()),
        ])

    union = DFFeatureUnion(transformer_list=[
        ("pipe1", select_and_scale(["sepal length (cm)", "sepal width (cm)"])),
        ("pipe2", select_and_scale(["petal length (cm)", "petal width (cm)"])),
    ])
    return Pipeline([("features", union),
                     ("estimator", LogisticRegression(solver="liblinear"))])
    def test_load_prediction_data_works_as_expected(self):
        """A trained Model predicts class 0 for row index 5 of the iris demo set."""
        dataset = load_demo_dataset("iris")
        dataset.create_train_test(stratify=True)
        model = Model(
            LogisticRegression(),
            feature_pipeline=Pipeline([("scale", DFStandardScaler())]),
        )
        model.train_estimator(dataset)

        result = model.make_prediction(dataset, 5)

        expected = pd.DataFrame({"Prediction": [0]})
        pd.testing.assert_frame_equal(result, expected, check_dtype=False)
    def test_standard_scaler_works_in_pipeline_with_feature_union(
            self, numerical: pd.DataFrame):
        """Scaling after a DFFeatureUnion yields the same z-scored frame."""
        # sqrt(1.25): presumably the population std of both fixture columns
        scale = 1.118033988749895
        expected = numerical.copy()
        for column, mean in (("number_a", 2.5), ("number_b", 6.5)):
            expected[column] = (numerical[column] - mean) / scale

        union = DFFeatureUnion([("number_a", Select(["number_a"])),
                                ("number_b", Select(["number_b"]))])

        result = make_pipeline(union, DFStandardScaler()).fit_transform(numerical)

        pd.testing.assert_frame_equal(result, expected)
Example #7
0
 def test_model_selection_works_with_feature_pipeline(
         self, train_iris_dataset: Dataset):
     """test_estimators wraps the winning estimator with the feature pipeline."""
     feature_pipeline = Pipeline([("scale", DFStandardScaler())])
     candidates = [
         RandomForestClassifier(),
         DummyClassifier(strategy="stratified")
     ]

     best_estimator, results = Model.test_estimators(
         data=train_iris_dataset,
         estimators=candidates,
         feature_pipeline=feature_pipeline,
     )

     # The forest (candidates[0]) is expected to win on iris.
     expected = Pipeline([("features", feature_pipeline),
                          ("estimator", candidates[0])])
     assert best_estimator.estimator.get_params() == expected.get_params()
Example #8
0
def test_DFStandardScaler_works_in_pipeline_with_DFFeatureUnion(
        categorical, numerical):
    """Scaling the output of a DFFeatureUnion matches scaling the raw frame."""
    # sqrt(1.25): presumably the population std of both fixture columns
    std = 1.118033988749895
    expected = numerical.copy()
    for column, mean in [('number_a', 2.5), ('number_b', 6.5)]:
        expected[column] = (numerical[column] - mean) / std

    union = DFFeatureUnion([('number_a', Select(['number_a'])),
                            ('number_b', Select(['number_b']))])
    pipeline = make_pipeline(union, DFStandardScaler())

    result = pipeline.fit_transform(numerical)

    pd.testing.assert_frame_equal(result, expected)
Example #9
0
    def test_plots_have_correct_title_when_using_pipeline(
            self, dataset: Dataset):
        """Plot titles name the final estimator, not the wrapping Pipeline."""
        model = Model(Pipeline([
            ("scale", DFStandardScaler()),
            ("clf", RandomForestClassifier(n_estimators=10)),
        ]))
        result = model.score_estimator(dataset)

        ax = result.plot.permutation_importance()

        title = ax.title.get_text()
        assert title == (
            "Permutation Importances (Accuracy) - RandomForestClassifier")
        # One tick label per feature in the dataset fixture.
        assert len(list(ax.get_yticklabels())) == 4
        plt.close()
 def test_standard_scaler_works_in_gridsearch(self, train_iris_dataset):
     """DFStandardScaler can be scored inside a grid search without error."""
     model = Model(create_gridsearch(DFStandardScaler()))

     outcome = model.score_estimator(train_iris_dataset)

     assert isinstance(outcome, Result)
 def test_standard_scaler_works_in_cv(self, train_iris_dataset):
     """DFStandardScaler survives 2-fold cross-validated scoring."""
     outcome = create_model(DFStandardScaler()).score_estimator(
         train_iris_dataset, cv=2)

     assert isinstance(outcome, Result)
 def test_can_reset_scaler_parameters(self):
     """_reset removes the fitted attributes from the scaler."""
     scaler = DFStandardScaler()
     # Simulate a fitted scaler by setting the fitted attributes directly.
     scaler.mean_ = 0.5
     scaler.scale_ = 0.7

     scaler._reset()

     assert not hasattr(scaler, "scale_")
 def test_works_without_args(self):
     """DFStandardScaler is constructible with all-default arguments."""
     scaler = DFStandardScaler()
     assert scaler
 def test_missing_data_can_pass_pipeline(self, missing_data: Dataset):
     """The missing-data plot accepts a feature_pipeline containing the scaler."""
     feature_pipeline = Pipeline([("scaler", DFStandardScaler())])

     ax = missing_data.plot.missing_data(feature_pipeline=feature_pipeline)

     labels = [text.get_text() for text in ax.texts]
     assert labels == ["2.0%"]
     plt.close()
Example #15
0
def pipeline_forest_classifier() -> Pipeline:
    """Return a scaler + 10-tree random-forest classification pipeline."""
    return Pipeline([
        ("scale", DFStandardScaler()),
        ("estimator", RandomForestClassifier(n_estimators=10)),
    ])
Example #16
0
def pipeline_dummy_classifier() -> Pipeline:
    """Return a scaler + prior-strategy dummy classifier baseline pipeline."""
    return Pipeline([
        ("scale", DFStandardScaler()),
        ("estimator", DummyClassifier(strategy="prior")),
    ])