Code example #1
def test_serialization(X_y_binary, ts_data, tmpdir, helper_functions):
    path = os.path.join(str(tmpdir), 'component.pkl')
    for component_class in all_components():
        print('Testing serialization of component {}'.format(component_class.name))
        try:
            component = helper_functions.safe_init_component_with_njobs_1(component_class)
        except EnsembleMissingPipelinesError:
            if component_class == StackedEnsembleClassifier:
                component = component_class(input_pipelines=[make_pipeline_from_components([RandomForestClassifier()], ProblemTypes.BINARY)], n_jobs=1)
            elif component_class == StackedEnsembleRegressor:
                component = component_class(input_pipelines=[make_pipeline_from_components([RandomForestRegressor()], ProblemTypes.REGRESSION)], n_jobs=1)
        if isinstance(component, Estimator) and ProblemTypes.TIME_SERIES_REGRESSION in component.supported_problem_types:
            X, y = ts_data
        else:
            X, y = X_y_binary

        component.fit(X, y)

        for pickle_protocol in range(cloudpickle.DEFAULT_PROTOCOL + 1):
            component.save(path, pickle_protocol=pickle_protocol)
            loaded_component = ComponentBase.load(path)
            assert component.parameters == loaded_component.parameters
            assert component.describe(return_dict=True) == loaded_component.describe(return_dict=True)
            if issubclass(component_class, Estimator) and not isinstance(component, (StackedEnsembleClassifier, StackedEnsembleRegressor)):
                assert (component.feature_importance == loaded_component.feature_importance).all()
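Outside the test loop, the save/load round trip this example exercises reduces to a few lines. A minimal sketch; the import paths are assumed from the evalml version these tests target:

import numpy as np
from evalml.pipelines.components import ComponentBase, RandomForestClassifier

X = np.random.standard_normal((100, 5))      # toy binary-classification data
y = np.random.randint(2, size=100)

component = RandomForestClassifier(n_jobs=1)
component.fit(X, y)
component.save('component.pkl')              # serialized via cloudpickle
loaded = ComponentBase.load('component.pkl')
assert component.parameters == loaded.parameters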
Code example #2
def test_stacked_different_input_pipelines_regression():
    input_pipelines = [
        make_pipeline_from_components([RandomForestRegressor()],
                                      ProblemTypes.REGRESSION),
        make_pipeline_from_components([RandomForestClassifier()],
                                      ProblemTypes.BINARY)
    ]
    with pytest.raises(ValueError,
                       match="All pipelines must have the same problem type."):
        StackedEnsembleRegressor(input_pipelines=input_pipelines)
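For contrast, the valid construction keeps every input pipeline on the same problem type. A minimal sketch under the same assumed imports; ExtraTreesRegressor is just one plausible second base estimator (see code example #9 for which model families are excluded):

# Valid counterpart: all base pipelines share ProblemTypes.REGRESSION.
input_pipelines = [
    make_pipeline_from_components([RandomForestRegressor()],
                                  ProblemTypes.REGRESSION),
    make_pipeline_from_components([ExtraTreesRegressor()],
                                  ProblemTypes.REGRESSION),
]
ensemble = StackedEnsembleRegressor(input_pipelines=input_pipelines, n_jobs=1)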
Code example #3
File: test_pipeline_utils.py  Project: kaidisn/evalml
def test_stacked_estimator_in_pipeline(problem_type, X_y_binary, X_y_multi, X_y_regression,
                                       stackable_classifiers,
                                       stackable_regressors,
                                       logistic_regression_binary_pipeline_class,
                                       logistic_regression_multiclass_pipeline_class,
                                       linear_regression_pipeline_class):
    if problem_type == ProblemTypes.BINARY:
        X, y = X_y_binary
        base_pipeline_class = BinaryClassificationPipeline
        stacking_component_name = StackedEnsembleClassifier.name
        input_pipelines = [make_pipeline_from_components([classifier], problem_type) for classifier in stackable_classifiers]
        comparison_pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
        objective = 'Log Loss Binary'
    elif problem_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
        base_pipeline_class = MulticlassClassificationPipeline
        stacking_component_name = StackedEnsembleClassifier.name
        input_pipelines = [make_pipeline_from_components([classifier], problem_type) for classifier in stackable_classifiers]
        comparison_pipeline = logistic_regression_multiclass_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}})
        objective = 'Log Loss Multiclass'
    elif problem_type == ProblemTypes.REGRESSION:
        X, y = X_y_regression
        base_pipeline_class = RegressionPipeline
        stacking_component_name = StackedEnsembleRegressor.name
        input_pipelines = [make_pipeline_from_components([regressor], problem_type) for regressor in stackable_regressors]
        comparison_pipeline = linear_regression_pipeline_class(parameters={"Linear Regressor": {"n_jobs": 1}})
        objective = 'R2'
    parameters = {
        stacking_component_name: {
            "input_pipelines": input_pipelines,
            "n_jobs": 1
        }
    }
    graph = ['Simple Imputer', stacking_component_name]

    class StackedPipeline(base_pipeline_class):
        component_graph = graph
        model_family = ModelFamily.ENSEMBLE

    pipeline = StackedPipeline(parameters=parameters)
    pipeline.fit(X, y)
    comparison_pipeline.fit(X, y)
    assert not np.isnan(pipeline.predict(X).to_series()).values.any()

    pipeline_score = pipeline.score(X, y, [objective])[objective]
    comparison_pipeline_score = comparison_pipeline.score(X, y, [objective])[objective]

    if problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        assert not np.isnan(pipeline.predict_proba(X).to_dataframe()).values.any()
        # Log Loss is lower-is-better, so the ensemble must score at least as
        # well as the single-estimator comparison pipeline.
        assert pipeline_score <= comparison_pipeline_score
    else:
        # R2 is higher-is-better.
        assert pipeline_score >= comparison_pipeline_score
Code example #4
def test_ensemble_data(mock_fit, mock_score, dummy_binary_pipeline_class,
                       stackable_classifiers):
    X = pd.DataFrame({"a": [i for i in range(100)]})
    y = pd.Series([i % 2 for i in range(100)])
    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          max_batches=19,
                          ensembling=True,
                          _ensembling_split_size=0.25)
    mock_should_continue_callback = MagicMock(return_value=True)
    mock_pre_evaluation_callback = MagicMock()
    mock_post_evaluation_callback = MagicMock()

    training_indices, ensembling_indices, _, _ = split_data(
        ww.DataTable(np.arange(X.shape[0])),
        y,
        problem_type='binary',
        test_size=0.25,
        random_seed=0)
    training_indices = training_indices.to_dataframe()[0].tolist()
    ensembling_indices = ensembling_indices.to_dataframe()[0].tolist()

    engine = SequentialEngine(
        X_train=infer_feature_types(X),
        y_train=infer_feature_types(y),
        ensembling_indices=ensembling_indices,
        automl=automl,
        should_continue_callback=mock_should_continue_callback,
        pre_evaluation_callback=mock_pre_evaluation_callback,
        post_evaluation_callback=mock_post_evaluation_callback)
    pipeline1 = [dummy_binary_pipeline_class({'Mock Classifier': {'a': 1}})]
    engine.evaluate_batch(pipeline1)
    # With 3-fold cross-validation during training, each fit sees 2/3 of the
    # training split (the ensembling rows are held out entirely).
    assert len(mock_fit.call_args[0][0]) == int(2 / 3 * len(training_indices))

    input_pipelines = [
        make_pipeline_from_components([classifier], problem_type='binary')
        for classifier in stackable_classifiers
    ]
    pipeline2 = [
        make_pipeline_from_components(
            [StackedEnsembleClassifier(input_pipelines, n_jobs=1)],
            problem_type='binary',
            custom_name="Stacked Ensemble Classification Pipeline")
    ]
    engine.evaluate_batch(pipeline2)
    assert len(mock_fit.call_args[0][0]) == int(2 / 3 * len(ensembling_indices))
Code example #5
def test_stacked_fit_predict_classification(X_y_binary, X_y_multi, stackable_classifiers, problem_type):
    if problem_type == ProblemTypes.BINARY:
        X, y = X_y_binary
        num_classes = 2
    elif problem_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
        num_classes = 3

    input_pipelines = [make_pipeline_from_components([classifier], problem_type)
                       for classifier in stackable_classifiers]
    clf = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert len(y_pred) == len(y)
    assert isinstance(y_pred, ww.DataColumn)
    assert not np.isnan(y_pred.to_series()).all()

    y_pred_proba = clf.predict_proba(X)
    assert isinstance(y_pred_proba, ww.DataTable)
    assert y_pred_proba.shape == (len(y), num_classes)
    assert not np.isnan(y_pred_proba.to_dataframe()).all().all()

    clf = StackedEnsembleClassifier(input_pipelines=input_pipelines, final_estimator=RandomForestClassifier(), n_jobs=1)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert len(y_pred) == len(y)
    assert isinstance(y_pred, ww.DataColumn)
    assert not np.isnan(y_pred.to_series()).all()

    y_pred_proba = clf.predict_proba(X)
    assert y_pred_proba.shape == (len(y), num_classes)
    assert isinstance(y_pred_proba, ww.DataTable)
    assert not np.isnan(y_pred_proba.to_dataframe()).all().all()
Code example #6
File: test_utils.py  Project: actuarial-tools/evalml
def test_scikit_learn_wrapper(X_y_binary, X_y_multi, X_y_regression):
    for estimator in [
            estimator for estimator in _all_estimators()
            if estimator.model_family != ModelFamily.ENSEMBLE
    ]:
        for problem_type in estimator.supported_problem_types:
            if problem_type == ProblemTypes.BINARY:
                X, y = X_y_binary
                num_classes = 2
            elif problem_type == ProblemTypes.MULTICLASS:
                X, y = X_y_multi
                num_classes = 3
            elif problem_type == ProblemTypes.REGRESSION:
                X, y = X_y_regression
            elif problem_type in [
                    ProblemTypes.TIME_SERIES_REGRESSION,
                    ProblemTypes.TIME_SERIES_MULTICLASS,
                    ProblemTypes.TIME_SERIES_BINARY
            ]:
                # Skipping because make_pipeline_from_components does not yet work for time series.
                continue

            evalml_pipeline = make_pipeline_from_components([estimator()],
                                                            problem_type)
            scikit_estimator = scikit_learn_wrapped_estimator(evalml_pipeline)
            scikit_estimator.fit(X, y)
            y_pred = scikit_estimator.predict(X)
            assert len(y_pred) == len(y)
            assert not np.isnan(y_pred).all()
            if problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
                y_pred_proba = scikit_estimator.predict_proba(X)
                assert y_pred_proba.shape == (len(y), num_classes)
                assert not np.isnan(y_pred_proba).all().all()
Code example #7
File: test_utils.py  Project: actuarial-tools/evalml
def test_scikit_learn_wrapper_invalid_problem_type():
    evalml_pipeline = make_pipeline_from_components([RandomForestClassifier()],
                                                    ProblemTypes.MULTICLASS)
    evalml_pipeline.problem_type = None
    with pytest.raises(
            ValueError,
            match="Could not wrap EvalML object in scikit-learn wrapper."):
        scikit_learn_wrapped_estimator(evalml_pipeline)
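The error above only fires because problem_type was cleared; with it intact, the wrap succeeds, as code example #6 exercises end to end. A minimal sketch:

# Successful wrap: the pipeline's problem_type is left intact.
pipeline = make_pipeline_from_components([RandomForestClassifier()],
                                         ProblemTypes.MULTICLASS)
wrapped = scikit_learn_wrapped_estimator(pipeline)
# `wrapped` now exposes scikit-learn's fit/predict (plus predict_proba for
# classification), as shown in code example #6.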
Code example #8
def test_stacked_feature_importance(mock_fit, X_y_regression, stackable_regressors):
    X, y = X_y_regression
    input_pipelines = [make_pipeline_from_components([regressor], ProblemTypes.REGRESSION)
                       for regressor in stackable_regressors]
    clf = StackedEnsembleRegressor(input_pipelines=input_pipelines, n_jobs=1)
    clf.fit(X, y)
    mock_fit.assert_called()
    clf._is_fitted = True
    with pytest.raises(NotImplementedError, match="feature_importance is not implemented"):
        clf.feature_importance
Code example #9
def test_stacked_ensemble_nonstackable_model_families():
    with pytest.raises(ValueError,
                       match="Pipelines with any of the following model families cannot be used as base pipelines"):
        StackedEnsembleRegressor(input_pipelines=[
            make_pipeline_from_components([BaselineRegressor()],
                                          ProblemTypes.REGRESSION)
        ])
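Combining this with code example #6 (which skips ModelFamily.ENSEMBLE), a plausible filter for stackable base estimators is sketched below. Treating ModelFamily.BASELINE as the other excluded family is an inference from this test, not a documented list:

stackable = [est for est in _all_estimators()
             if est.model_family not in (ModelFamily.ENSEMBLE,
                                         ModelFamily.BASELINE)]
input_pipelines = [make_pipeline_from_components([est()], ProblemTypes.REGRESSION)
                   for est in stackable
                   if ProblemTypes.REGRESSION in est.supported_problem_types]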
Code example #10
def test_generate_code_errors():
    with pytest.raises(ValueError, match="Element must be a component instance"):
        generate_component_code(make_pipeline_from_components([RandomForestClassifier()], ProblemTypes.BINARY))

    with pytest.raises(ValueError, match="Element must be a component instance"):
        generate_component_code(LinearRegressor)

    with pytest.raises(ValueError, match="Element must be a component instance"):
        generate_component_code(Imputer)

    with pytest.raises(ValueError, match="Element must be a component instance"):
        generate_component_code(ComponentBase)
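All four failing calls pass something other than a component instance: a pipeline, then three classes. The accepted input is an instantiated component; a minimal sketch of the valid call:

# Valid call: generate_component_code takes a component *instance*.
code_string = generate_component_code(RandomForestClassifier())
print(code_string)  # Python source that should reconstruct the component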
Code example #11
def test_stacked_feature_importance(mock_fit, X_y_binary, X_y_multi, stackable_classifiers, problem_type):
    if problem_type == ProblemTypes.BINARY:
        X, y = X_y_binary
    elif problem_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
    input_pipelines = [make_pipeline_from_components([classifier], problem_type)
                       for classifier in stackable_classifiers]
    clf = StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1)
    clf.fit(X, y)
    mock_fit.assert_called()
    clf._is_fitted = True
    with pytest.raises(NotImplementedError, match="feature_importance is not implemented"):
        clf.feature_importance
Code example #12
File: test_engine_base.py  Project: sujala/evalml
def test_evaluate_pipeline_handles_ensembling_indices(mock_fit, mock_score, dummy_binary_pipeline_class,
                                                      stackable_classifiers):
    X = ww.DataTable(pd.DataFrame({"a": [i for i in range(100)]}))
    y = ww.DataColumn(pd.Series([i % 2 for i in range(100)]))

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_batches=19, ensembling=True, _ensembling_split_size=0.25)

    training_indices, ensembling_indices, _, _ = split_data(ww.DataTable(np.arange(X.shape[0])), y, problem_type='binary', test_size=0.25, random_seed=0)
    training_indices, ensembling_indices = training_indices.to_dataframe()[0].tolist(), ensembling_indices.to_dataframe()[0].tolist()

    pipeline1 = dummy_binary_pipeline_class({'Mock Classifier': {'a': 1}})

    _ = evaluate_pipeline(pipeline1, automl, X, y, logger=MagicMock())
    # The 2/3 factor reflects 3-fold cross-validation: each fit sees two of
    # the three folds of the training split.
    assert len(mock_fit.call_args[0][0]) == int(2 / 3 * len(training_indices))

    input_pipelines = [make_pipeline_from_components([classifier], problem_type='binary')
                       for classifier in stackable_classifiers]

    pipeline2 = make_pipeline_from_components([StackedEnsembleClassifier(input_pipelines, n_jobs=1)],
                                              problem_type='binary',
                                              custom_name="Stacked Ensemble Classification Pipeline")
    _ = evaluate_pipeline(pipeline2, automl, X, y, logger=MagicMock())
    assert len(mock_fit.call_args[0][0]) == int(2 / 3 * len(ensembling_indices))
Code example #13
def test_stacked_fit_predict_regression(X_y_regression, stackable_regressors):
    X, y = X_y_regression
    input_pipelines = [make_pipeline_from_components([regressor], ProblemTypes.REGRESSION)
                       for regressor in stackable_regressors]
    clf = StackedEnsembleRegressor(input_pipelines=input_pipelines, n_jobs=1)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert len(y_pred) == len(y)
    assert isinstance(y_pred, pd.Series)
    assert not np.isnan(y_pred).all()

    clf = StackedEnsembleRegressor(input_pipelines=input_pipelines, final_estimator=RandomForestRegressor(), n_jobs=1)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert len(y_pred) == len(y)
    assert isinstance(y_pred, pd.Series)
    assert not np.isnan(y_pred).all()
Code example #14
File: test_pipeline_utils.py  Project: kaidisn/evalml
def test_make_pipeline_from_components(X_y_binary, logistic_regression_binary_pipeline_class):
    with pytest.raises(ValueError, match="Pipeline needs to have an estimator at the last position of the component list"):
        make_pipeline_from_components([Imputer()], problem_type='binary')

    with pytest.raises(KeyError, match="Problem type 'invalid_type' does not exist"):
        make_pipeline_from_components([RandomForestClassifier()], problem_type='invalid_type')

    with pytest.raises(TypeError, match="Custom pipeline name must be a string"):
        make_pipeline_from_components([RandomForestClassifier()], problem_type='binary', custom_name=True)

    with pytest.raises(TypeError, match="Every element of `component_instances` must be an instance of ComponentBase"):
        make_pipeline_from_components([RandomForestClassifier], problem_type='binary')

    with pytest.raises(TypeError, match="Every element of `component_instances` must be an instance of ComponentBase"):
        make_pipeline_from_components(['RandomForestClassifier'], problem_type='binary')

    imp = Imputer(numeric_impute_strategy='median', random_seed=5)
    est = RandomForestClassifier(random_seed=7)
    pipeline = make_pipeline_from_components([imp, est], ProblemTypes.BINARY, custom_name='My Pipeline',
                                             random_seed=15)
    assert [c.__class__ for c in pipeline] == [Imputer, RandomForestClassifier]
    assert all(c.random_seed == 15 for c in pipeline)
    assert pipeline.problem_type == ProblemTypes.BINARY
    assert pipeline.custom_name == 'My Pipeline'
    expected_parameters = {
        'Imputer': {
            'categorical_impute_strategy': 'most_frequent',
            'numeric_impute_strategy': 'median',
            'categorical_fill_value': None,
            'numeric_fill_value': None},
        'Random Forest Classifier': {
            'n_estimators': 100,
            'max_depth': 6,
            'n_jobs': -1}
    }
    assert pipeline.parameters == expected_parameters
    assert pipeline.random_seed == 15

    class DummyEstimator(Estimator):
        name = "Dummy!"
        model_family = "foo"
        supported_problem_types = [ProblemTypes.BINARY]
        parameters = {'bar': 'baz'}
    random_seed = 42
    pipeline = make_pipeline_from_components([DummyEstimator(random_seed=3)], ProblemTypes.BINARY,
                                             random_seed=random_seed)
    components_list = [c for c in pipeline]
    assert len(components_list) == 1
    assert isinstance(components_list[0], DummyEstimator)
    assert components_list[0].random_seed == random_seed
    expected_parameters = {'Dummy!': {'bar': 'baz'}}
    assert pipeline.parameters == expected_parameters
    assert pipeline.random_seed == random_seed

    X, y = X_y_binary
    pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}},
                                                         random_seed=42)
    component_instances = [c for c in pipeline]
    new_pipeline = make_pipeline_from_components(component_instances, ProblemTypes.BINARY)
    pipeline.fit(X, y)
    predictions = pipeline.predict(X)
    new_pipeline.fit(X, y)
    new_predictions = new_pipeline.predict(X)
    assert np.array_equal(predictions, new_predictions)
    assert np.array_equal(pipeline.feature_importance, new_pipeline.feature_importance)
    assert new_pipeline.name == 'Templated Pipeline'
    assert pipeline.parameters == new_pipeline.parameters
    for component, new_component in zip(pipeline._component_graph, new_pipeline._component_graph):
        assert isinstance(new_component, type(component))
    assert pipeline.describe() == new_pipeline.describe()
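Condensing what this test exercises, the call signature appears to be roughly the following; this is a summary inferred from the assertions above, not the library's documented API:

pipeline = make_pipeline_from_components(
    component_instances,         # ComponentBase instances; an estimator must be last
    problem_type,                # a ProblemTypes member or its string name
    custom_name='My Pipeline',   # optional str; the default name is 'Templated Pipeline'
    random_seed=15,              # re-seeds every component in the pipeline
)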
Code example #15
def test_score_batch_works(mock_score, pipeline_score_side_effect, X_y_binary,
                           dummy_binary_pipeline_class, stackable_classifiers,
                           caplog):

    exceptions_to_check = []
    expected_scores = {}
    for i, e in enumerate(pipeline_score_side_effect):
        # Ensemble pipeline has different name
        pipeline_name = f"Pipeline {i}" if i < len(
            pipeline_score_side_effect) - 1 else "Templated Pipeline"
        scores = no_exception_scores
        if isinstance(e, PipelineScoreError):
            scores = {"F1": np.nan, "AUC": np.nan, "Log Loss Binary": np.nan}
            scores.update(e.scored_successfully)
            exceptions_to_check.append(f"Score error for {pipeline_name}")

        expected_scores[pipeline_name] = scores

    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          max_iterations=1,
                          allowed_pipelines=[dummy_binary_pipeline_class])

    engine = SequentialEngine(X_train=automl.X_train,
                              y_train=automl.y_train,
                              automl=automl)

    def make_pipeline_name(index):
        class DummyPipeline(dummy_binary_pipeline_class):
            custom_name = f"Pipeline {index}"

        return DummyPipeline({'Mock Classifier': {'a': index}})

    pipelines = [
        make_pipeline_name(i)
        for i in range(len(pipeline_score_side_effect) - 1)
    ]
    ensemble_input_pipelines = [
        make_pipeline_from_components([classifier], problem_type="binary")
        for classifier in stackable_classifiers[:2]
    ]
    ensemble = make_pipeline_from_components(
        [StackedEnsembleClassifier(ensemble_input_pipelines, n_jobs=1)],
        problem_type="binary")
    pipelines.append(ensemble)

    def score_batch_and_check():
        caplog.clear()
        with patch('evalml.pipelines.BinaryClassificationPipeline.score') as mock_score:
            mock_score.side_effect = pipeline_score_side_effect

            scores = engine.score_batch(
                pipelines, X, y, objectives=["Log Loss Binary", "F1", "AUC"])
            assert scores == expected_scores
            for exception in exceptions_to_check:
                assert exception in caplog.text

    # Test scoring before search
    score_batch_and_check()

    automl.search()

    # Test scoring after search
    score_batch_and_check()
Code example #16
def test_train_batch_works(mock_score, pipeline_fit_side_effect, X_y_binary,
                           dummy_binary_pipeline_class, stackable_classifiers,
                           caplog):

    exceptions_to_check = [
        str(e) for e in pipeline_fit_side_effect if isinstance(e, Exception)
    ]

    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          max_time=1,
                          max_iterations=2,
                          train_best_pipeline=False,
                          n_jobs=1)

    engine = SequentialEngine(X_train=automl.X_train,
                              y_train=automl.y_train,
                              automl=automl)

    def make_pipeline_name(index):
        class DummyPipeline(dummy_binary_pipeline_class):
            custom_name = f"Pipeline {index}"

        return DummyPipeline({'Mock Classifier': {'a': index}})

    pipelines = [
        make_pipeline_name(i)
        for i in range(len(pipeline_fit_side_effect) - 1)
    ]
    ensemble_input_pipelines = [
        make_pipeline_from_components([classifier], problem_type="binary")
        for classifier in stackable_classifiers[:2]
    ]
    ensemble = make_pipeline_from_components(
        [StackedEnsembleClassifier(ensemble_input_pipelines, n_jobs=1)],
        problem_type="binary")
    pipelines.append(ensemble)

    def train_batch_and_check():
        caplog.clear()
        with patch('evalml.pipelines.BinaryClassificationPipeline.fit') as mock_fit:
            mock_fit.side_effect = pipeline_fit_side_effect

            trained_pipelines = engine.train_batch(pipelines)

            assert len(trained_pipelines) == len(
                pipeline_fit_side_effect) - len(exceptions_to_check)
            assert mock_fit.call_count == len(pipeline_fit_side_effect)
            for exception in exceptions_to_check:
                assert exception in caplog.text

    # Test training before search is run
    train_batch_and_check()

    # Test training after search.
    automl.search()

    train_batch_and_check()