Ejemplo n.º 1
0
def test_lead_scoring_objective(X_y_binary):
    """Search once with a LeadScoring objective, then check its decisions and score.

    ``X_y_binary`` is assumed to be a fixture providing an ``(X, y)`` pair for a
    binary problem; it is defined outside this view.
    """
    X, y = X_y_binary
    lead_scoring = LeadScoring(true_positives=1, false_positives=-1)

    search = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective=lead_scoring, max_iterations=1,
                          random_seed=0)
    search.search()

    # Smoke-check the winning pipeline: these calls only need to complete.
    best = search.best_pipeline
    best.fit(X, y)
    best.predict(X)
    best.predict_proba(X)
    best.score(X, y, [lead_scoring])

    raw_values = [1, 10, .5, 5]
    expected_flags = [False, True, False, True]
    y_true = pd.Series(expected_flags)

    # Series input; the second argument is presumably the decision threshold.
    out = lead_scoring.decision_function(pd.Series(raw_values), 1)
    assert out.tolist() == expected_flags

    # The same values as an ndarray must give the same decisions.
    out = lead_scoring.decision_function(np.array(raw_values), 1)
    assert out.tolist() == y_true.to_list()

    score = lead_scoring.score(out, y_true)
    assert score == 0.5
Ejemplo n.º 2
0
def test_callback(X_y_regression):
    """Both search callbacks should fire exactly ``max_iterations`` times for a regression search."""
    X, y = X_y_regression

    start_key = "start_iteration_callback"
    result_key = "add_result_callback"
    counts = {start_key: 0, result_key: 0}

    # The tally dict is bound as a default argument, so each callback keeps its
    # own reference to it regardless of how it is invoked.
    def start_iteration_callback(pipeline_class, parameters, automl_obj,
                                 counts=counts):
        counts[start_key] += 1

    def add_result_callback(results, trained_pipeline, automl_obj,
                            counts=counts):
        counts[result_key] += 1

    max_iterations = 3
    search_kwargs = dict(
        X_train=X,
        y_train=y,
        problem_type='regression',
        objective="R2",
        max_iterations=max_iterations,
        start_iteration_callback=start_iteration_callback,
        add_result_callback=add_result_callback,
        n_jobs=1,
    )
    automl = AutoMLSearch(**search_kwargs)
    automl.search()

    assert counts[start_key] == max_iterations
    assert counts[result_key] == max_iterations
Ejemplo n.º 3
0
def test_automl_time_series_classification_pickle_generated_pipeline(
        mock_binary_fit, mock_multi_fit, mock_binary_score,
        mock_multiclass_score, problem_type, X_y_binary, X_y_multi):
    """Each ranked time series classification pipeline must be the generated class and survive pickling.

    NOTE(review): the ``mock_*`` arguments and ``problem_type`` are presumably
    injected by patch/parametrize decorators that are not visible here.
    """
    is_binary = problem_type == ProblemTypes.TIME_SERIES_BINARY
    X, y = X_y_binary if is_binary else X_y_multi
    expected_class = (GeneratedPipelineTimeSeriesBinary if is_binary
                      else GeneratedPipelineTimeSeriesMulticlass)

    configuration = dict(gap=0, max_delay=0,
                         delay_target=False, delay_features=True)
    a = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type,
                     problem_configuration=configuration)
    a.search()

    for _, ranking_row in a.rankings.iterrows():
        pipeline_id = ranking_row['id']
        assert a.get_pipeline(pipeline_id).__class__ == expected_class
        # A pickle round trip must give back a truthy object.
        assert pickle.loads(pickle.dumps(a.get_pipeline(pipeline_id)))
def test_non_optimizable_threshold_multi(mock_fit, mock_score, X_y_multi):
    """A multiclass best pipeline has no ``threshold`` attribute, whether or not ``optimize_thresholds`` is passed."""
    mock_score.return_value = {"Log Loss Multiclass": 0.5}
    X, y = X_y_multi

    # First pass leaves optimize_thresholds at its default; the second sets it
    # to True explicitly.
    for extra_kwargs in ({}, {"optimize_thresholds": True}):
        automl = AutoMLSearch(X_train=X, y_train=y,
                              problem_type='multiclass',
                              objective='Log Loss Multiclass',
                              max_iterations=1,
                              **extra_kwargs)
        automl.search()
        mock_fit.assert_called()
        mock_score.assert_called()
        with pytest.raises(AttributeError):
            automl.best_pipeline.threshold
def test_data_splitter(X_y_binary):
    """A custom ``data_splitter`` should produce one ``cv_data`` entry per requested split."""
    X, y = X_y_binary
    cv_folds = 5

    # Same checks for each splitter type, built with the same split count.
    for splitter_class in (BalancedClassificationDataCVSplit, TimeSeriesSplit):
        automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                              data_splitter=splitter_class(n_splits=cv_folds),
                              max_iterations=1, n_jobs=1)
        automl.search()

        assert isinstance(automl.rankings, pd.DataFrame)
        first_result = automl.results['pipeline_results'][0]
        assert len(first_result["cv_data"]) == cv_folds
def test_automl_allowed_pipelines_init_allowed_both_not_specified_multi(
        mock_fit, mock_score, X_y_multi,
        assert_allowed_pipelines_equal_helper):
    """With both allow-lists set to ``None``, a multiclass search should allow one pipeline per estimator.

    The expectation is rebuilt from ``get_estimators`` and ``make_pipeline`` and
    compared against ``allowed_pipelines`` both before and after ``search()``.
    """
    X, y = X_y_multi
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='multiclass',
                          allowed_pipelines=None, allowed_model_families=None)
    # The objective name is read from the search object, so the mocked score
    # can only be configured after construction.
    mock_score.return_value = {automl.objective.name: 1.0}

    estimators = get_estimators(ProblemTypes.MULTICLASS, model_families=None)
    expected_pipelines = [
        make_pipeline(X, y, estimator, ProblemTypes.MULTICLASS)
        for estimator in estimators
    ]
    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines,
                                          expected_pipelines)

    automl.search()
    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines,
                                          expected_pipelines)
    expected_families = {p.model_family for p in expected_pipelines}
    assert set(automl.allowed_model_families) == expected_families
    mock_fit.assert_called()
    mock_score.assert_called()
def test_plot_iterations_ipython_mock_import_failure(mock_ipython_display,
                                                     X_y_binary):
    """When the mocked IPython display raises ImportError, the plot call should still return a plotly Figure."""
    pytest.importorskip(
        'IPython.display',
        reason='Skipping plotting test because ipywidgets not installed')
    go = pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')
    X, y = X_y_binary

    n_iterations = 3
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective="f1", max_iterations=n_iterations,
                          n_jobs=1)
    automl.search()

    # Make the display call fail the way a missing dependency would.
    mock_ipython_display.side_effect = ImportError('KABOOOOOOMMMM')
    figure = automl.plot.search_iteration_plot(interactive_plot=True)
    mock_ipython_display.assert_called_once()

    assert isinstance(figure, go.Figure)
    assert isinstance(figure.data, tuple)
    first_trace = figure.data[0]
    x_values = pd.Series(first_trace['x'])
    y_values = pd.Series(first_trace['y'])
    assert x_values.is_monotonic_increasing
    assert y_values.is_monotonic_increasing
    assert len(x_values) == n_iterations
    assert len(y_values) == n_iterations
def test_automl_supports_time_series_classification(mock_binary_fit, mock_multi_fit, mock_binary_score, mock_multiclass_score,
                                                    problem_type, X_y_binary, X_y_multi):
    """A time series classification search should use TimeSeriesSplit, a baseline at id 0, and pass the configuration through.

    NOTE(review): the ``mock_*`` arguments and ``problem_type`` are presumably
    injected by patch/parametrize decorators that are not visible here.
    """
    configuration = dict(gap=0, max_delay=0,
                         delay_target=False, delay_features=True)

    if problem_type == ProblemTypes.TIME_SERIES_BINARY:
        X, y = X_y_binary
        baseline = TimeSeriesBaselineBinaryPipeline
        mock_binary_score.return_value = {"Log Loss Binary": 0.2}
        # From here on the problem type is passed in its string form.
        problem_type = 'time series binary'
    else:
        X, y = X_y_multi
        baseline = TimeSeriesBaselineMulticlassPipeline
        mock_multiclass_score.return_value = {"Log Loss Multiclass": 0.25}
        problem_type = 'time series multiclass'

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type=problem_type,
                          problem_configuration=configuration,
                          max_batches=2)
    automl.search()
    assert isinstance(automl.data_splitter, TimeSeriesSplit)

    for result in automl.results['pipeline_results'].values():
        if result["id"] == 0:
            # Result 0 is expected to be the baseline pipeline.
            assert result['pipeline_class'] == baseline
        else:
            params = result['parameters']
            assert params['Delayed Feature Transformer'] == configuration
            assert params['pipeline'] == configuration
def test_optimizable_threshold_enabled(mock_fit, mock_score,
                                       mock_predict_proba, mock_encode_targets,
                                       mock_optimize_threshold, X_y_binary,
                                       caplog):
    """With ``optimize_thresholds=True`` the mocked optimizer's value (0.8) should reach the best pipeline and CV folds 0-2."""
    optimized_threshold = 0.8
    mock_optimize_threshold.return_value = optimized_threshold
    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective='precision', max_iterations=1,
                          optimize_thresholds=True)
    mock_score.return_value = {'precision': 1.0}
    automl.search()

    for mocked in (mock_fit, mock_score, mock_predict_proba,
                   mock_optimize_threshold):
        mocked.assert_called()

    assert automl.best_pipeline.threshold == optimized_threshold
    cv_data = automl.results['pipeline_results'][0]['cv_data']
    for fold in range(3):
        recorded = cv_data[fold].get('binary_classification_threshold')
        assert recorded == optimized_threshold

    # describe_pipeline output is read back through the captured log text.
    automl.describe_pipeline(0)
    out = caplog.text
    assert "Objective to optimize binary classification pipeline thresholds for" in out
def test_optimizable_threshold_disabled(mock_fit, mock_score,
                                        mock_predict_proba,
                                        mock_encode_targets,
                                        mock_optimize_threshold, X_y_binary):
    """With ``optimize_thresholds=False`` the optimizer is never called and the threshold is 0.5 throughout."""
    # Configured but expected to go unused.
    mock_optimize_threshold.return_value = 0.8
    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective='precision', max_iterations=1,
                          optimize_thresholds=False)
    mock_score.return_value = {automl.objective.name: 1.0}
    automl.search()

    mock_fit.assert_called()
    mock_score.assert_called()
    assert not mock_predict_proba.called
    assert not mock_optimize_threshold.called

    expected_threshold = 0.5
    assert automl.best_pipeline.threshold == expected_threshold
    cv_data = automl.results['pipeline_results'][0]['cv_data']
    for fold in range(3):
        recorded = cv_data[fold].get('binary_classification_threshold')
        assert recorded == expected_threshold
def test_init(X_y_binary):
    """Basic binary search sanity checks, first with the fixture data as given, then wrapped in pandas objects."""
    X, y = X_y_binary
    common_kwargs = dict(problem_type='binary', max_iterations=1, n_jobs=1)

    automl = AutoMLSearch(X_train=X, y_train=y, **common_kwargs)
    automl.search()

    assert automl.n_jobs == 1
    assert isinstance(automl.rankings, pd.DataFrame)
    assert isinstance(automl.best_pipeline, PipelineBase)
    automl.best_pipeline.predict(X)

    # Same search with the data passed positionally as DataFrame / Series.
    automl = AutoMLSearch(pd.DataFrame(X), pd.Series(y), **common_kwargs)
    automl.search()

    assert isinstance(automl.rankings, pd.DataFrame)
    assert isinstance(automl.full_rankings, pd.DataFrame)
    assert isinstance(automl.best_pipeline, PipelineBase)
    assert isinstance(automl.get_pipeline(0), PipelineBase)
    # No objective was passed, so this checks the one the search picked.
    assert automl.objective.name == 'Log Loss Binary'
    automl.best_pipeline.predict(X)
def test_callback(X_y_binary):
    """Count callback invocations during a three-iteration binary search.

    The start callback is expected once per binary estimator plus one
    (presumably the baseline pipeline -- confirm against the search
    implementation); the result callback once per iteration.
    """
    X, y = X_y_binary

    start_key = "start_iteration_callback"
    result_key = "add_result_callback"
    counts = {start_key: 0, result_key: 0}

    # The tally dict is bound as a default argument so each callback carries
    # its own reference to it.
    def start_iteration_callback(pipeline_class, parameters, automl_obj,
                                 counts=counts):
        counts[start_key] += 1

    def add_result_callback(results, trained_pipeline, automl_obj,
                            counts=counts):
        counts[result_key] += 1

    max_iterations = 3
    search_kwargs = dict(
        X_train=X,
        y_train=y,
        problem_type='binary',
        objective=Precision(),
        max_iterations=max_iterations,
        start_iteration_callback=start_iteration_callback,
        add_result_callback=add_result_callback,
        n_jobs=1,
    )
    automl = AutoMLSearch(**search_kwargs)
    automl.search()

    expected_starts = len(get_estimators('binary')) + 1
    assert counts[start_key] == expected_starts
    assert counts[result_key] == max_iterations
def test_automl_time_series_classification_threshold(mock_binary_fit, mock_binary_score, mock_predict_proba, mock_optimize_threshold, mock_split_data,
                                                     optimize, objective, X_y_binary):
    """Check threshold handling for a time series binary search across objective / ``optimize`` combinations.

    NOTE(review): ``optimize`` and ``objective`` are presumably parametrized
    and the ``mock_*`` arguments patched by decorators not visible here.
    """
    X, y = X_y_binary
    problem_type = 'time series binary'
    configuration = dict(gap=0, max_delay=0,
                         delay_target=False, delay_features=True)

    mock_binary_score.return_value = {objective: 0.4}
    mock_optimize_threshold.return_value = 0.62
    mock_split_data.return_value = split_data(X, y, problem_type, test_size=0.2, random_state=0)

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type=problem_type,
                          problem_configuration=configuration,
                          objective=objective,
                          optimize_thresholds=optimize,
                          max_batches=2)
    automl.search()
    assert isinstance(automl.data_splitter, TimeSeriesSplit)

    if objective == 'Log Loss Binary':
        # No threshold tuning and no extra data split for this objective.
        mock_optimize_threshold.assert_not_called()
        assert automl.best_pipeline.threshold is None
        mock_split_data.assert_not_called()
    elif objective == 'F1':
        if optimize:
            mock_optimize_threshold.assert_called()
            assert automl.best_pipeline.threshold == 0.62
            mock_split_data.assert_called()
            # Third positional argument of the split call is the problem type.
            assert str(mock_split_data.call_args[0][2]) == problem_type
        else:
            mock_optimize_threshold.assert_not_called()
            assert automl.best_pipeline.threshold == 0.5
            mock_split_data.assert_not_called()
Ejemplo n.º 14
0
def test_init(X_y_regression):
    """Basic regression search sanity checks, first with the fixture data as given, then wrapped in pandas objects."""
    X, y = X_y_regression
    common_kwargs = dict(problem_type='regression', objective="R2",
                         max_iterations=3, n_jobs=1)

    automl = AutoMLSearch(X_train=X, y_train=y, **common_kwargs)
    automl.search()

    assert automl.n_jobs == 1
    assert isinstance(automl.rankings, pd.DataFrame)
    assert isinstance(automl.best_pipeline, PipelineBase)
    automl.best_pipeline.predict(X)

    # Same search with the data passed positionally as DataFrame / Series.
    automl = AutoMLSearch(pd.DataFrame(X), pd.Series(y), **common_kwargs)
    automl.search()

    for ranking_table in (automl.rankings, automl.full_rankings):
        assert isinstance(ranking_table, pd.DataFrame)
    assert isinstance(automl.best_pipeline, PipelineBase)
    automl.best_pipeline.predict(X)
    assert isinstance(automl.get_pipeline(0), PipelineBase)
Ejemplo n.º 15
0
def test_automl_supports_time_series_regression(mock_fit, mock_score,
                                                X_y_regression):
    """A time series regression search should use TimeSeriesSplit, a baseline at id 0, and pass the configuration through."""
    X, y = X_y_regression
    configuration = dict(gap=0, max_delay=0,
                         delay_target=False, delay_features=True)

    automl = AutoMLSearch(X_train=X, y_train=y,
                          problem_type="time series regression",
                          problem_configuration=configuration,
                          max_batches=2)
    automl.search()
    assert isinstance(automl.data_splitter, TimeSeriesSplit)

    for result in automl.results['pipeline_results'].values():
        if result["id"] == 0:
            # Result 0 is expected to be the baseline pipeline.
            assert result['pipeline_class'] == TimeSeriesBaselineRegressionPipeline
        else:
            params = result['parameters']
            assert params['Delayed Feature Transformer'] == configuration
            assert params['pipeline'] == configuration
Ejemplo n.º 16
0
def test_automl_allowed_pipelines_no_allowed_pipelines(automl_type, X_y_binary, X_y_multi):
    """An empty ``allowed_model_families`` list leaves nothing to search, so ``search()`` must raise ValueError.

    NOTE(review): ``automl_type`` is presumably parametrized by a decorator
    that is not visible here.
    """
    if automl_type == ProblemTypes.MULTICLASS:
        X, y = X_y_multi
        problem_type = 'multiclass'
    else:
        X, y = X_y_binary
        problem_type = 'binary'

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type=problem_type,
                          allowed_pipelines=None,
                          allowed_model_families=[])
    # Before search() the pipelines attribute is expected to still be None.
    assert automl.allowed_pipelines is None
    with pytest.raises(ValueError, match="No allowed pipelines to search"):
        automl.search()
def test_random_seed(X_y_binary):
    """Two binary searches built with the same ``random_seed`` must produce equal rankings."""
    X, y = X_y_binary

    def run_seeded_search():
        # A fresh Precision objective and search object for each run.
        seeded = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                              objective=Precision(), max_iterations=5,
                              random_seed=0, n_jobs=1)
        seeded.search()
        return seeded

    automl = run_seeded_search()
    automl_1 = run_seeded_search()
    assert automl.rankings.equals(automl_1.rankings)
def test_binary_auto(X_y_binary):
    """The best pipeline of a binary search should be fitted and predict exactly two distinct values."""
    X, y = X_y_binary
    search_kwargs = dict(X_train=X, y_train=y, problem_type='binary',
                         objective="Log Loss Binary", max_iterations=5,
                         n_jobs=1)
    automl = AutoMLSearch(**search_kwargs)
    automl.search()

    winner = automl.best_pipeline
    assert winner._is_fitted
    predictions = winner.predict(X)
    distinct_labels = np.unique(predictions.to_series())
    assert len(distinct_labels) == 2
Ejemplo n.º 19
0
def test_automl_time_series_regression_pickle_generated_pipeline(mock_fit, mock_score, X_y_regression):
    """Each ranked time series regression pipeline must be the generated class and survive pickling."""
    X, y = X_y_regression
    configuration = dict(gap=0, max_delay=0,
                         delay_target=False, delay_features=True)
    a = AutoMLSearch(X_train=X,
                     y_train=y,
                     problem_type="time series regression",
                     problem_configuration=configuration)
    a.search()

    for _, ranking_row in a.rankings.iterrows():
        pipeline_id = ranking_row['id']
        assert a.get_pipeline(pipeline_id).__class__ == GeneratedPipelineTimeSeriesRegression
        # A pickle round trip must give back a truthy object.
        assert pickle.loads(pickle.dumps(a.get_pipeline(pipeline_id)))
def test_automl_allowed_pipelines_init_allowed_both_specified(mock_fit, mock_score, dummy_regression_pipeline_class, X_y_regression, assert_allowed_pipelines_equal_helper):
    """When both allow-lists are given, the expected pipelines and families are those of ``allowed_pipelines``.

    The search is built with a RANDOM_FOREST family list, yet the assertions
    compare against the dummy pipeline class only.
    """
    X, y = X_y_regression
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        allowed_pipelines=[dummy_regression_pipeline_class],
        allowed_model_families=[ModelFamily.RANDOM_FOREST],
    )
    mock_score.return_value = {automl.objective.name: 1.0}

    expected_pipelines = [dummy_regression_pipeline_class]
    assert_allowed_pipelines_equal_helper(automl.allowed_pipelines, expected_pipelines)
    expected_families = {p.model_family for p in expected_pipelines}
    assert set(automl.allowed_model_families) == expected_families

    automl.search()
    mock_fit.assert_called()
    mock_score.assert_called()
def test_max_iterations(X_y_binary):
    """``full_rankings`` should contain exactly ``max_iterations`` rows after the search."""
    X, y = X_y_binary
    max_iterations = 5
    search_kwargs = dict(X_train=X, y_train=y, problem_type='binary',
                         max_iterations=max_iterations, n_jobs=1)
    automl = AutoMLSearch(**search_kwargs)
    automl.search()

    evaluated = len(automl.full_rankings)
    assert evaluated == max_iterations
Ejemplo n.º 22
0
def test_automl_allowed_pipelines_no_allowed_pipelines(X_y_regression):
    """An empty ``allowed_model_families`` list leaves nothing to search, so ``search()`` must raise ValueError."""
    X, y = X_y_regression
    search_kwargs = dict(X_train=X, y_train=y, problem_type='regression',
                         allowed_pipelines=None, allowed_model_families=[])
    automl = AutoMLSearch(**search_kwargs)

    # Before search() the pipelines attribute is expected to still be None.
    assert automl.allowed_pipelines is None
    with pytest.raises(ValueError, match="No allowed pipelines to search"):
        automl.search()
def test_categorical_classification(X_y_categorical_classification):
    """A binary search over the categorical fixture data must yield at least one non-null mean CV score."""
    X, y = X_y_categorical_classification
    search_kwargs = dict(X_train=X, y_train=y, problem_type='binary',
                         objective="precision", max_iterations=5, n_jobs=1)
    automl = AutoMLSearch(**search_kwargs)
    automl.search()

    all_scores_missing = automl.rankings["mean_cv_score"].isnull().all()
    assert not all_scores_missing
def test_max_time(X_y_binary):
    """Even with a vanishingly small ``max_time``, exactly one pipeline result should be recorded."""
    X, y = X_y_binary
    search_kwargs = dict(X_train=X, y_train=y, problem_type='binary',
                         max_time=1e-16, n_jobs=1)
    automl = AutoMLSearch(**search_kwargs)
    automl.search()

    # The time budget is effectively zero, but the search is expected to
    # evaluate one pipeline before stopping.
    pipeline_results = automl.results['pipeline_results']
    assert len(pipeline_results) == 1
Ejemplo n.º 25
0
def test_categorical_regression(X_y_categorical_regression):
    """A regression search over the categorical fixture data must yield at least one non-null mean CV score.

    The seed is passed as ``random_seed`` and the score is read from the
    ``mean_cv_score`` rankings column, matching how the other searches in this
    file spell them.
    NOTE(review): confirm both names against the installed library version.
    """
    X, y = X_y_categorical_regression
    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='regression',
                          objective="R2",
                          max_iterations=5,
                          random_seed=0,
                          n_jobs=1)
    automl.search()
    assert not automl.rankings['mean_cv_score'].isnull().all()
def test_recall_object(X_y_binary):
    """Passing a ``Recall()`` instance as the objective should give results and an objective named 'Recall'."""
    X, y = X_y_binary
    recall_objective = Recall()
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary',
                          objective=recall_objective, max_iterations=1,
                          n_jobs=1)
    automl.search()

    assert len(automl.full_rankings) > 0
    assert automl.objective.name == 'Recall'
def test_plot_iterations_ipython_mock(mock_ipython_display, X_y_binary):
    """With display mocked, the interactive plot call should return a SearchIterationPlot and display its figure."""
    pytest.importorskip(
        'IPython.display',
        reason='Skipping plotting test because ipywidgets not installed')
    pytest.importorskip(
        'plotly.graph_objects',
        reason='Skipping plotting test because plotly not installed')
    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          objective="f1",
                          max_iterations=3,
                          n_jobs=1)
    automl.search()

    iteration_plot = automl.plot.search_iteration_plot(interactive_plot=True)
    assert isinstance(iteration_plot, SearchIterationPlot)
    # The plot's ``data`` attribute is checked to be an AutoMLSearch instance.
    assert isinstance(iteration_plot.data, AutoMLSearch)
    mock_ipython_display.assert_called_with(iteration_plot.best_score_by_iter_fig)
def test_non_optimizable_threshold(mock_fit, mock_score, X_y_binary):
    """With the AUC objective, no threshold is set on the best pipeline or recorded for CV folds 0-2."""
    mock_score.return_value = {"AUC": 1.0}
    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          objective='AUC',
                          max_iterations=1)
    automl.search()
    mock_fit.assert_called()
    mock_score.assert_called()

    assert automl.best_pipeline.threshold is None
    cv_data = automl.results['pipeline_results'][0]['cv_data']
    for fold in range(3):
        assert cv_data[fold].get('binary_classification_threshold') is None
Ejemplo n.º 29
0
def test_random_seed(X_y_regression):
    """Two regression searches built with the same ``random_seed`` must produce matching rankings."""
    X, y = X_y_regression

    def run_seeded_search():
        seeded = AutoMLSearch(X_train=X, y_train=y,
                              problem_type='regression', objective="R2",
                              max_iterations=5, random_seed=0, n_jobs=1)
        seeded.search()
        return seeded

    automl = run_seeded_search()
    automl_1 = run_seeded_search()

    # assert_frame_equal is used instead of exact equality because R2 may
    # differ at the 10+ decimal place. It raises on mismatch and returns None
    # otherwise.
    comparison = pd.testing.assert_frame_equal(automl.rankings, automl_1.rankings)
    assert comparison is None
Ejemplo n.º 30
0
def test_automl_regression_nonlinear_pipeline_search(nonlinear_regression_pipeline_class, X_y_regression):
    """A two-iteration search restricted to one nonlinear pipeline should start the mean baseline first, then that pipeline."""
    X, y = X_y_regression

    start_iteration_callback = MagicMock()
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type='regression',
        max_iterations=2,
        start_iteration_callback=start_iteration_callback,
        allowed_pipelines=[nonlinear_regression_pipeline_class],
        n_jobs=1,
    )
    automl.search()

    assert start_iteration_callback.call_count == 2
    # Index [0][0] picks the first positional argument of each recorded call.
    first_call, second_call = start_iteration_callback.call_args_list[:2]
    assert first_call[0][0] == MeanBaselineRegressionPipeline
    assert second_call[0][0] == nonlinear_regression_pipeline_class