Beispiel #1
0
def test_stacked_ensemble_init_with_invalid_estimators_parameter():
    """Check that building the ensemble without input pipelines is rejected.

    Both omitting ``input_pipelines`` and passing an empty list are expected
    to raise ``EnsembleMissingPipelinesError``.
    """
    # ``match`` is a regular expression, so the trailing period is escaped to
    # require a literal "." instead of accepting any character.
    expected_message = r'must not be None or an empty list\.'
    for init_kwargs in ({}, {'input_pipelines': []}):
        with pytest.raises(EnsembleMissingPipelinesError, match=expected_message):
            StackedEnsembleClassifier(**init_kwargs)
def test_stacked_feature_importance(mock_fit, X_y_binary, X_y_multi, stackable_classifiers, problem_type):
    """Reading ``feature_importance`` on a stacked ensemble raises ``NotImplementedError``."""
    if problem_type == ProblemTypes.BINARY:
        features, target = X_y_binary
    elif problem_type == ProblemTypes.MULTICLASS:
        features, target = X_y_multi

    # One single-component pipeline per stackable classifier.
    input_pipelines = []
    for classifier in stackable_classifiers:
        input_pipelines.append(make_pipeline_from_components([classifier], problem_type))

    ensemble = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1)
    ensemble.fit(features, target)
    mock_fit.assert_called()

    # The fitted flag is set by hand before the property is read
    # (presumably because the patched fit does not set it - TODO confirm).
    ensemble._is_fitted = True
    with pytest.raises(NotImplementedError, match="feature_importance is not implemented"):
        ensemble.feature_importance
def test_stacked_ensemble_n_jobs_negative_one(X_y_binary, logistic_regression_binary_pipeline_class):
    """An ensemble created with ``n_jobs=-1`` reports that value and can fit and predict."""
    features, target = X_y_binary
    base_pipeline = logistic_regression_binary_pipeline_class(parameters={})
    input_pipelines = [base_pipeline]
    ensemble = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=-1)

    assert ensemble.parameters == {
        "input_pipelines": input_pipelines,
        "final_estimator": None,
        'cv': None,
        'n_jobs': -1,
    }

    ensemble.fit(features, target)
    predictions = ensemble.predict(features)
    assert len(predictions) == len(target)
    # Only guards against predictions that are NaN in every position.
    assert not np.isnan(predictions.to_series()).all()
def test_stacked_ensemble_does_not_overwrite_pipeline_random_state(mock_stack,
                                                                   logistic_regression_binary_pipeline_class):
    """Input pipelines keep their own ``random_state`` when wrapped by the ensemble.

    ``mock_stack`` is presumably the patched stacking estimator; the patch
    decorator is not visible here, so verify against the full test module.
    """
    first_pipeline = logistic_regression_binary_pipeline_class(parameters={}, random_state=3)
    second_pipeline = logistic_regression_binary_pipeline_class(parameters={}, random_state=4)
    ensemble = StackedEnsembleClassifier(input_pipelines=[first_pipeline, second_pipeline],
                                         random_state=5,
                                         n_jobs=1)

    # Index 1 of ``call_args`` holds the keyword arguments of the latest call.
    stack_kwargs = mock_stack.call_args[1]
    wrapped_estimators = stack_kwargs['estimators']

    assert ensemble.random_state == 5
    for position, expected_seed in enumerate((3, 4)):
        assert wrapped_estimators[position][1].pipeline.random_state == expected_seed
Beispiel #5
0
def test_stacked_different_input_pipelines_classification():
    """Mixing a multiclass and a binary input pipeline is expected to raise ``ValueError``."""
    input_pipelines = [
        make_pipeline_from_components([RandomForestClassifier()],
                                      ProblemTypes.MULTICLASS),
        make_pipeline_from_components([RandomForestClassifier()],
                                      ProblemTypes.BINARY)
    ]
    # ``match`` is a regular expression: escape the final period so it only
    # matches a literal "." in the error message.
    with pytest.raises(ValueError,
                       match=r"All pipelines must have the same problem type\."):
        StackedEnsembleClassifier(input_pipelines=input_pipelines)
Beispiel #6
0
def test_stacked_ensemble_nonstackable_model_families():
    """A baseline-classifier pipeline is rejected as a base pipeline of the ensemble."""
    expected_message = "Pipelines with any of the following model families cannot be used as base pipelines"
    with pytest.raises(ValueError, match=expected_message):
        # Built inside the context manager so the pipeline construction stays
        # covered by the same ``raises`` check.
        baseline_pipeline = make_pipeline_from_components([BaselineClassifier()],
                                                          ProblemTypes.BINARY)
        StackedEnsembleClassifier(input_pipelines=[baseline_pipeline])
def test_stacked_ensemble_init_with_multiple_same_estimators(X_y_binary, logistic_regression_binary_pipeline_class):
    """Several input pipelines of the same estimator type are accepted by the ensemble."""
    features, target = X_y_binary
    input_pipelines = [logistic_regression_binary_pipeline_class(parameters={})
                       for _ in range(2)]
    ensemble = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1)

    assert ensemble.parameters == {
        "input_pipelines": input_pipelines,
        "final_estimator": None,
        'cv': None,
        'n_jobs': 1,
    }

    # ``fit`` is expected to hand back a StackedEnsembleClassifier.
    assert isinstance(ensemble.fit(features, target), StackedEnsembleClassifier)

    predictions = ensemble.predict(features)
    assert len(predictions) == len(target)
    # Only guards against predictions that are NaN in every position.
    assert not np.isnan(predictions.to_series()).all()
Beispiel #8
0
def test_ensemble_data(mock_fit, mock_score, dummy_binary_pipeline_class,
                       stackable_classifiers):
    """Check how many rows reach ``fit`` for a regular batch and for an ensemble batch.

    After each ``evaluate_batch`` call, the length of the first positional
    argument of the latest ``mock_fit`` call is compared against two thirds of
    the training indices (regular pipeline) or of the ensembling indices
    (stacked ensemble pipeline).
    """
    n_rows = 100
    X = pd.DataFrame({"a": list(range(n_rows))})
    y = pd.Series([value % 2 for value in range(n_rows)])
    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          max_batches=19,
                          ensembling=True,
                          _ensembling_split_size=0.25)

    # Split row positions with the same test size passed to AutoMLSearch as
    # ``_ensembling_split_size``.
    row_positions = ww.DataTable(np.arange(X.shape[0]))
    train_split, ensemble_split, _, _ = split_data(row_positions,
                                                   y,
                                                   problem_type='binary',
                                                   test_size=0.25,
                                                   random_seed=0)
    training_indices = train_split.to_dataframe()[0].tolist()
    ensembling_indices = ensemble_split.to_dataframe()[0].tolist()

    engine = SequentialEngine(
        X_train=infer_feature_types(X),
        y_train=infer_feature_types(y),
        ensembling_indices=ensembling_indices,
        automl=automl,
        should_continue_callback=MagicMock(return_value=True),
        pre_evaluation_callback=MagicMock(),
        post_evaluation_callback=MagicMock())

    def expected_fit_rows(indices):
        # The 2/3 factor takes the data splits into account
        # (presumably the cross-validation split - TODO confirm).
        return int(2 / 3 * len(indices))

    regular_batch = [dummy_binary_pipeline_class({'Mock Classifier': {'a': 1}})]
    engine.evaluate_batch(regular_batch)
    assert len(mock_fit.call_args[0][0]) == expected_fit_rows(training_indices)

    input_pipelines = []
    for classifier in stackable_classifiers:
        input_pipelines.append(
            make_pipeline_from_components([classifier], problem_type='binary'))
    ensemble_component = StackedEnsembleClassifier(input_pipelines, n_jobs=1)
    ensemble_batch = [
        make_pipeline_from_components(
            [ensemble_component],
            problem_type='binary',
            custom_name="Stacked Ensemble Classification Pipeline")
    ]
    engine.evaluate_batch(ensemble_batch)
    assert len(mock_fit.call_args[0][0]) == expected_fit_rows(ensembling_indices)
def test_stacked_ensemble_multilevel(logistic_regression_binary_pipeline_class):
    """Use a stacked ensemble classifier as the final estimator of another ensemble.

    The outer ensemble is fitted and must return one prediction per row, with
    the predictions not being NaN everywhere.
    """
    # Draw the features from a locally seeded generator: the data is identical
    # on every run (so a failure can be reproduced) and the global NumPy
    # random state is left untouched.
    X = pd.DataFrame(np.random.RandomState(0).rand(50, 5))
    y = pd.Series([1, 0] * 25)
    base = StackedEnsembleClassifier(input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})], n_jobs=1)
    clf = StackedEnsembleClassifier(input_pipelines=[logistic_regression_binary_pipeline_class(parameters={})],
                                    final_estimator=base,
                                    n_jobs=1)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert len(y_pred) == len(y)
    assert not np.isnan(y_pred.to_series()).all()
def test_stacked_fit_predict_classification(X_y_binary, X_y_multi, stackable_classifiers, problem_type):
    """Fit the ensemble and check the type and shape of its predictions.

    The same checks run twice: once without an explicit final estimator and
    once with a ``RandomForestClassifier`` as final estimator. ``predict`` must
    return a ``ww.DataColumn`` with one entry per row and ``predict_proba`` a
    ``ww.DataTable`` with one column per class.
    """
    if problem_type == ProblemTypes.BINARY:
        X, y = X_y_binary
        num_classes = 2
    elif problem_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
        num_classes = 3

    input_pipelines = [make_pipeline_from_components([classifier], problem_type)
                       for classifier in stackable_classifiers]

    # Extra constructor arguments for each configuration under test; the empty
    # dict leaves ``final_estimator`` unset.
    configurations = ({}, {'final_estimator': RandomForestClassifier()})
    for extra_kwargs in configurations:
        clf = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1, **extra_kwargs)
        clf.fit(X, y)

        y_pred = clf.predict(X)
        assert len(y_pred) == len(y)
        assert isinstance(y_pred, ww.DataColumn)
        # Only guards against predictions that are NaN in every position.
        assert not np.isnan(y_pred.to_series()).all()

        y_pred_proba = clf.predict_proba(X)
        assert isinstance(y_pred_proba, ww.DataTable)
        assert y_pred_proba.shape == (len(y), num_classes)
        assert not np.isnan(y_pred_proba.to_dataframe()).all().all()
Beispiel #11
0
def test_score_batch_works(mock_score, pipeline_score_side_effect, X_y_binary,
                           dummy_binary_pipeline_class, stackable_classifiers,
                           caplog):
    """Check ``engine.score_batch`` results and logged errors before and after search.

    ``pipeline_score_side_effect`` supplies one outcome per pipeline. An
    outcome that is a ``PipelineScoreError`` is expected to yield NaN scores
    updated with whatever was scored successfully, plus a "Score error"
    message in the captured log; any other outcome maps to
    ``no_exception_scores``.
    """
    last_index = len(pipeline_score_side_effect) - 1

    expected_scores = {}
    exceptions_to_check = []
    for position, outcome in enumerate(pipeline_score_side_effect):
        # The ensemble pipeline is appended last and has a different name.
        if position < last_index:
            pipeline_name = f"Pipeline {position}"
        else:
            pipeline_name = "Templated Pipeline"

        if isinstance(outcome, PipelineScoreError):
            scores = {"F1": np.nan, "AUC": np.nan, "Log Loss Binary": np.nan}
            scores.update(outcome.scored_successfully)
            exceptions_to_check.append(f"Score error for {pipeline_name}")
        else:
            scores = no_exception_scores

        expected_scores[pipeline_name] = scores

    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          max_iterations=1,
                          allowed_pipelines=[dummy_binary_pipeline_class])

    engine = SequentialEngine(X_train=automl.X_train,
                              y_train=automl.y_train,
                              automl=automl)

    def build_named_pipeline(index):
        # Subclass the dummy pipeline so each instance carries its own name.
        class DummyPipeline(dummy_binary_pipeline_class):
            custom_name = f"Pipeline {index}"

        return DummyPipeline({'Mock Classifier': {'a': index}})

    pipelines = [build_named_pipeline(index) for index in range(last_index)]

    ensemble_input_pipelines = []
    for classifier in stackable_classifiers[:2]:
        ensemble_input_pipelines.append(
            make_pipeline_from_components([classifier], problem_type="binary"))
    ensemble_component = StackedEnsembleClassifier(ensemble_input_pipelines, n_jobs=1)
    pipelines.append(
        make_pipeline_from_components([ensemble_component], problem_type="binary"))

    def score_batch_and_check():
        caplog.clear()
        score_target = 'evalml.pipelines.BinaryClassificationPipeline.score'
        with patch(score_target) as patched_score:
            patched_score.side_effect = pipeline_score_side_effect

            actual_scores = engine.score_batch(
                pipelines, X, y, objectives=["Log Loss Binary", "F1", "AUC"])
            assert actual_scores == expected_scores
            for message in exceptions_to_check:
                assert message in caplog.text

    # Scoring must work before the search has run ...
    score_batch_and_check()

    automl.search()

    # ... and again once the search has finished.
    score_batch_and_check()
Beispiel #12
0
def test_train_batch_works(mock_score, pipeline_fit_side_effect, X_y_binary,
                           dummy_binary_pipeline_class, stackable_classifiers,
                           caplog):
    """Check ``engine.train_batch`` results and logged errors before and after search.

    ``pipeline_fit_side_effect`` supplies one outcome per ``fit`` call. The
    batch must return one trained pipeline per outcome that is not an
    exception, call ``fit`` once per outcome, and log the text of every
    exception.
    """
    exceptions_to_check = []
    for outcome in pipeline_fit_side_effect:
        if isinstance(outcome, Exception):
            exceptions_to_check.append(str(outcome))

    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          max_time=1,
                          max_iterations=2,
                          train_best_pipeline=False,
                          n_jobs=1)

    engine = SequentialEngine(X_train=automl.X_train,
                              y_train=automl.y_train,
                              automl=automl)

    def build_named_pipeline(index):
        # Subclass the dummy pipeline so each instance carries its own name.
        class DummyPipeline(dummy_binary_pipeline_class):
            custom_name = f"Pipeline {index}"

        return DummyPipeline({'Mock Classifier': {'a': index}})

    # All but the last outcome belong to dummy pipelines; the ensemble
    # appended below takes the last one.
    n_dummy_pipelines = len(pipeline_fit_side_effect) - 1
    pipelines = [build_named_pipeline(index) for index in range(n_dummy_pipelines)]

    ensemble_input_pipelines = []
    for classifier in stackable_classifiers[:2]:
        ensemble_input_pipelines.append(
            make_pipeline_from_components([classifier], problem_type="binary"))
    ensemble_component = StackedEnsembleClassifier(ensemble_input_pipelines, n_jobs=1)
    pipelines.append(
        make_pipeline_from_components([ensemble_component], problem_type="binary"))

    def train_batch_and_check():
        caplog.clear()
        fit_target = 'evalml.pipelines.BinaryClassificationPipeline.fit'
        with patch(fit_target) as patched_fit:
            patched_fit.side_effect = pipeline_fit_side_effect

            trained_pipelines = engine.train_batch(pipelines)

            assert len(trained_pipelines) == len(
                pipeline_fit_side_effect) - len(exceptions_to_check)
            assert patched_fit.call_count == len(pipeline_fit_side_effect)
            for message in exceptions_to_check:
                assert message in caplog.text

    # Training must work before the search has run ...
    train_batch_and_check()

    # ... and again once the search has finished.
    automl.search()

    train_batch_and_check()
Beispiel #13
0
def test_stacked_different_input_pipelines_classification():
    """Mixing a binary and a multiclass input pipeline is expected to raise ``ValueError``."""
    input_pipelines = [BinaryClassificationPipeline([RandomForestClassifier]),
                       MulticlassClassificationPipeline([RandomForestClassifier])]
    # ``match`` is a regular expression: escape the final period so it only
    # matches a literal "." in the error message.
    with pytest.raises(ValueError, match=r"All pipelines must have the same problem type\."):
        StackedEnsembleClassifier(input_pipelines=input_pipelines)
Beispiel #14
0
def test_stacked_ensemble_nonstackable_model_families():
    """A baseline-classifier pipeline is rejected as a base pipeline of the ensemble."""
    expected_message = "Pipelines with any of the following model families cannot be used as base pipelines"
    with pytest.raises(ValueError, match=expected_message):
        # Built inside the context manager so the pipeline construction stays
        # covered by the same ``raises`` check.
        baseline_pipeline = BinaryClassificationPipeline([BaselineClassifier])
        StackedEnsembleClassifier(input_pipelines=[baseline_pipeline])